diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..d7fc24b --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2025-03-03T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.09695v2","updated":"2025-03-03T14:48:45Z","published":"2025-01-16T17:48:03Z","title":"Mitigating Hallucinations in Large Vision-Language Models via DPO:\n On-Policy Data Hold the Key","summary":" Hallucination remains a major challenge for Large Vision-Language Models\n(LVLMs). Direct Preference Optimization (DPO) has gained increasing attention\nas a simple solution to hallucination issues. It directly learns from\nconstructed preference pairs that reflect the severity of hallucinations in\nresponses to the same prompt and image. Nonetheless, different data\nconstruction methods in existing works bring notable performance variations. We\nidentify a crucial factor here: outcomes are largely contingent on whether the\nconstructed data aligns on-policy w.r.t the initial (reference) policy of DPO.\nTheoretical analysis suggests that learning from off-policy data is impeded by\nthe presence of KL-divergence between the updated policy and the reference\npolicy. From the perspective of dataset distribution, we systematically\nsummarize the inherent flaws in existing algorithms that employ DPO to address\nhallucination issues. To alleviate the problems, we propose On-Policy Alignment\n(OPA)-DPO framework, which uniquely leverages expert feedback to correct\nhallucinated responses and aligns both the original and expert-revised\nresponses in an on-policy manner. Notably, with only 4.8k data, OPA-DPO\nachieves an additional reduction in the hallucination rate of LLaVA-1.5-7B:\n13.26% on the AMBER benchmark and 5.39% on the Object-Hal benchmark, compared\nto the previous SOTA algorithm trained with 16k samples. 
Our implementation is\navailable at https://github.com/zhyang2226/OPA-DPO.\n","authors":["Zhihe Yang","Xufang Luo","Dongqi Han","Yunjian Xu","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2501.09695v2.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2412.07487v2","updated":"2025-03-03T14:04:23Z","published":"2024-12-10T13:12:32Z","title":"Stereo Hand-Object Reconstruction for Human-to-Robot Handover","summary":" Jointly estimating hand and object shape facilitates the grasping task in\nhuman-to-robot handovers. However, relying on hand-crafted prior knowledge\nabout the geometric structure of the object fails when generalising to unseen\nobjects, and depth sensors fail to detect transparent objects such as drinking\nglasses. In this work, we propose a stereo-based method for hand-object\nreconstruction that combines single-view reconstructions probabilistically to\nform a coherent stereo reconstruction. We learn 3D shape priors from a large\nsynthetic hand-object dataset to ensure that our method is generalisable, and\nuse RGB inputs to better capture transparent objects. We show that our method\nreduces the object Chamfer distance compared to existing RGB based hand-object\nreconstruction methods on single view and stereo settings. We process the\nreconstructed hand-object shape with a projection-based outlier removal step\nand use the output to guide a human-to-robot handover pipeline with\nwide-baseline stereo RGB cameras. 
Our hand-object reconstruction enables a\nrobot to successfully receive a diverse range of household objects from the\nhuman.\n","authors":["Yik Lung Pang","Alessio Xompero","Changjae Oh","Andrea Cavallaro"],"pdf_url":"https://arxiv.org/pdf/2412.07487v2.pdf","comment":"8 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2412.02993v2","updated":"2025-03-03T13:59:01Z","published":"2024-12-04T03:19:43Z","title":"EchoONE: Segmenting Multiple echocardiography Planes in One Model","summary":" In clinical practice of echocardiography examinations, multiple planes\ncontaining the heart structures of different view are usually required in\nscreening, diagnosis and treatment of cardiac disease. AI models for\nechocardiography have to be tailored for each specific plane due to the\ndramatic structure differences, thus resulting in repetition development and\nextra complexity. Effective solution for such a multi-plane segmentation (MPS)\nproblem is highly demanded for medical images, yet has not been well\ninvestigated. In this paper, we propose a novel solution, EchoONE, for this\nproblem with a SAM-based segmentation architecture, a prior-composable mask\nlearning (PC-Mask) module for semantic-aware dense prompt generation, and a\nlearnable CNN-branch with a simple yet effective local feature fusion and\nadaption (LFFA) module for SAM adapting. We extensively evaluated our method on\nmultiple internal and external echocardiography datasets, and achieved\nconsistently state-of-the-art performance for multi-source datasets with\ndifferent heart planes. This is the first time that the MPS problem is solved\nin one model for echocardiography data. 
The code will be available at\nhttps://github.com/a2502503/EchoONE.\n","authors":["Jiongtong Hu","Wei Zhuo","Jun Cheng","Yingying Liu","Wufeng Xue","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2412.02993v2.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2502.18858v2","updated":"2025-03-03T13:38:50Z","published":"2025-02-26T05:59:45Z","title":"Evaluating Intelligence via Trial and Error","summary":" Intelligence is a crucial trait for species to find solutions within a\nlimited number of trial-and-error attempts. Building on this idea, we introduce\nSurvival Game as a framework to evaluate intelligence based on the number of\nfailed attempts in a trial-and-error process. Fewer failures indicate higher\nintelligence. When the expectation and variance of failure counts are both\nfinite, it signals the ability to consistently find solutions to new\nchallenges, which we define as the Autonomous Level of intelligence. Using\nSurvival Game, we comprehensively evaluate existing AI systems. Our results\nshow that while AI systems achieve the Autonomous Level in simple tasks, they\nare still far from it in more complex tasks, such as vision, search,\nrecommendation, and language. While scaling current AI technologies might help,\nthis would come at an astronomical cost. Projections suggest that achieving the\nAutonomous Level for general tasks would require $10^{26}$ parameters. To put\nthis into perspective, loading such a massive model requires so many H100 GPUs\nthat their total value is $10^{7}$ times that of Apple Inc.'s market value.\nEven with Moore's Law, supporting such a parameter scale would take $70$ years.\nThis staggering cost highlights the complexity of human tasks and the\ninadequacies of current AI technologies. To further investigate this\nphenomenon, we conduct a theoretical analysis of Survival Game and its\nexperimental results. Our findings suggest that human tasks possess a\ncriticality property. 
As a result, Autonomous Level requires a deep\nunderstanding of the task's underlying mechanisms. Current AI systems, however,\ndo not fully grasp these mechanisms and instead rely on superficial mimicry,\nmaking it difficult for them to reach an autonomous level. We believe Survival\nGame can not only guide the future development of AI but also offer profound\ninsights into human intelligence.\n","authors":["Jingtao Zhan","Jiahao Zhao","Jiayu Li","Yiqun Liu","Bo Zhang","Qingyao Ai","Jiaxin Mao","Hongning Wang","Min Zhang","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2502.18858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15517v2","updated":"2025-03-03T13:22:14Z","published":"2024-09-23T20:09:43Z","title":"MATCH POLICY: A Simple Pipeline from Point Cloud Registration to\n Manipulation Policies","summary":" Many manipulation tasks require the robot to rearrange objects relative to\none another. Such tasks can be described as a sequence of relative poses\nbetween parts of a set of rigid bodies. In this work, we propose MATCH POLICY,\na simple but novel pipeline for solving high-precision pick and place tasks.\nInstead of predicting actions directly, our method registers the pick and place\ntargets to the stored demonstrations. This transfers action inference into a\npoint cloud registration task and enables us to realize nontrivial manipulation\npolicies without any training. MATCH POLICY is designed to solve high-precision\ntasks with a key-frame setting. By leveraging the geometric interaction and the\nsymmetries of the task, it achieves extremely high sample efficiency and\ngeneralizability to unseen configurations. 
We demonstrate its state-of-the-art\nperformance across various tasks on RLBench benchmark compared with several\nstrong baselines and test it on a real robot with six tasks.\n","authors":["Haojie Huang","Haotian Liu","Dian Wang","Robin Walters","Robert Platt"],"pdf_url":"https://arxiv.org/pdf/2409.15517v2.pdf","comment":"project url: https://haojhuang.github.io/match_page/"},{"id":"http://arxiv.org/abs/2409.20171v3","updated":"2025-03-03T13:12:48Z","published":"2024-09-30T10:29:41Z","title":"Annotation-Free Curb Detection Leveraging Altitude Difference Image","summary":" Road curbs are considered as one of the crucial and ubiquitous traffic\nfeatures, which are essential for ensuring the safety of autonomous vehicles.\nCurrent methods for detecting curbs primarily rely on camera imagery or LiDAR\npoint clouds. Image-based methods are vulnerable to fluctuations in lighting\nconditions and exhibit poor robustness, while methods based on point clouds\ncircumvent the issues associated with lighting variations. However, it is the\ntypical case that significant processing delays are encountered due to the\nvoluminous amount of 3D points contained in each frame of the point cloud data.\nFurthermore, the inherently unstructured characteristics of point clouds poses\nchallenges for integrating the latest deep learning advancements into point\ncloud data applications. To address these issues, this work proposes an\nannotation-free curb detection method leveraging Altitude Difference Image\n(ADI), which effectively mitigates the aforementioned challenges. Given that\nmethods based on deep learning generally demand extensive, manually annotated\ndatasets, which are both expensive and labor-intensive to create, we present an\nAutomatic Curb Annotator (ACA) module. This module utilizes a deterministic\ncurb detection algorithm to automatically generate a vast quantity of training\ndata. 
Consequently, it facilitates the training of the curb detection model\nwithout necessitating any manual annotation of data. Finally, by incorporating\na post-processing module, we manage to achieve state-of-the-art results on the\nKITTI 3D curb dataset with considerably reduced processing delays compared to\nexisting methods, which underscores the effectiveness of our approach in curb\ndetection tasks.\n","authors":["Fulong Ma","Peng Hou","Yuxuan Liu","Yang Liu","Ming Liu","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2409.20171v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09555v3","updated":"2025-03-03T13:05:35Z","published":"2025-01-16T14:18:06Z","title":"Text-driven Adaptation of Foundation Models for Few-shot Surgical\n Workflow Analysis","summary":" Purpose: Surgical workflow analysis is crucial for improving surgical\nefficiency and safety. However, previous studies rely heavily on large-scale\nannotated datasets, posing challenges in cost, scalability, and reliance on\nexpert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven\nAdaptation), designed to handle various surgical workflow analysis tasks with\nminimal paired image-label data.\n Methods: Our approach has two key components. First, Few-shot selection-based\nmodality alignment selects a small subset of images and aligns their embeddings\nwith text embeddings from the downstream task, bridging the modality gap.\nSecond, Text-driven adaptation leverages only text data to train a decoder,\neliminating the need for paired image-text data. This decoder is then applied\nto aligned image embeddings, enabling image-related tasks without explicit\nimage-text pairs.\n Results: We evaluate our approach to generative tasks (image captioning) and\ndiscriminative tasks (triplet recognition and phase recognition). 
Results show\nthat Surg-FTDA outperforms baselines and generalizes well across downstream\ntasks.\n Conclusion: We propose a text-driven adaptation approach that mitigates the\nmodality gap and handles multiple downstream tasks in surgical workflow\nanalysis, with minimal reliance on large annotated datasets. The code and\ndataset will be released in https://github.com/CAMMA-public/Surg-FTDA\n","authors":["Tingxuan Chen","Kun Yuan","Vinkle Srivastav","Nassir Navab","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2501.09555v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.11142v2","updated":"2025-03-03T12:56:35Z","published":"2025-02-16T14:17:36Z","title":"NavRAG: Generating User Demand Instructions for Embodied Navigation\n through Retrieval-Augmented LLM","summary":" Vision-and-Language Navigation (VLN) is an essential skill for embodied\nagents, allowing them to navigate in 3D environments following natural language\ninstructions. High-performance navigation models require a large amount of\ntraining data, the high cost of manually annotating data has seriously hindered\nthis field. Therefore, some previous methods translate trajectory videos into\nstep-by-step instructions for expanding data, but such instructions do not\nmatch well with users' communication styles that briefly describe destinations\nor state specific needs. Moreover, local navigation trajectories overlook\nglobal context and high-level task planning. To address these issues, we\npropose NavRAG, a retrieval-augmented generation (RAG) framework that generates\nuser demand instructions for VLN. NavRAG leverages LLM to build a hierarchical\nscene description tree for 3D scene understanding from global layout to local\ndetails, then simulates various user roles with specific demands to retrieve\nfrom the scene tree, generating diverse instructions with LLM. 
We annotate over\n2 million navigation instructions across 861 scenes and evaluate the data\nquality and navigation performance of trained models.\n","authors":["Zihan Wang","Yaohui Zhu","Gim Hee Lee","Yachun Fan"],"pdf_url":"https://arxiv.org/pdf/2502.11142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.14616v2","updated":"2025-03-03T12:37:18Z","published":"2025-02-20T14:57:01Z","title":"Monocular Depth Estimation and Segmentation for Transparent Object with\n Iterative Semantic and Geometric Fusion","summary":" Transparent object perception is indispensable for numerous robotic tasks.\nHowever, accurately segmenting and estimating the depth of transparent objects\nremain challenging due to complex optical properties. Existing methods\nprimarily delve into only one task using extra inputs or specialized sensors,\nneglecting the valuable interactions among tasks and the subsequent refinement\nprocess, leading to suboptimal and blurry predictions. To address these issues,\nwe propose a monocular framework, which is the first to excel in both\nsegmentation and depth estimation of transparent objects, with only a\nsingle-image input. Specifically, we devise a novel semantic and geometric\nfusion module, effectively integrating the multi-scale information between\ntasks. In addition, drawing inspiration from human perception of objects, we\nfurther incorporate an iterative strategy, which progressively refines initial\nfeatures for clearer results. Experiments on two challenging synthetic and\nreal-world datasets demonstrate that our model surpasses state-of-the-art\nmonocular, stereo, and multi-view methods by a large margin of about\n38.8%-46.2% with only a single RGB input. Codes and models are publicly\navailable at https://github.com/L-J-Yuan/MODEST.\n","authors":["Jiangyuan Liu","Hongxuan Ma","Yuxin Guo","Yuhao Zhao","Chi Zhang","Wei Sui","Wei Zou"],"pdf_url":"https://arxiv.org/pdf/2502.14616v2.pdf","comment":"Accepted by ICRA(2025). 
The code is accessible through:\n https://github.com/L-J-Yuan/MODEST"},{"id":"http://arxiv.org/abs/2408.04591v2","updated":"2025-03-03T12:35:33Z","published":"2024-08-08T17:04:06Z","title":"HiLo: A Learning Framework for Generalized Category Discovery Robust to\n Domain Shifts","summary":" Generalized Category Discovery (GCD) is a challenging task in which, given a\npartially labelled dataset, models must categorize all unlabelled instances,\nregardless of whether they come from labelled categories or from new ones. In\nthis paper, we challenge a remaining assumption in this task: that all images\nshare the same domain. Specifically, we introduce a new task and method to\nhandle GCD when the unlabelled data also contains images from different domains\nto the labelled set. Our proposed `HiLo' networks extract High-level semantic\nand Low-level domain features, before minimizing the mutual information between\nthe representations. Our intuition is that the clusterings based on domain\ninformation and semantic information should be independent. We further extend\nour method with a specialized domain augmentation tailored for the GCD task, as\nwell as a curriculum learning approach. Finally, we construct a benchmark from\ncorrupted fine-grained datasets as well as a large-scale evaluation on\nDomainNet with real-world domain shifts, reimplementing a number of GCD\nbaselines in this setting. 
We demonstrate that HiLo outperforms SoTA category\ndiscovery models by a large margin on all evaluations.\n","authors":["Hongjun Wang","Sagar Vaze","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2408.04591v2.pdf","comment":"v2: Accepted as a conference paper at ICLR 2025; Project page:\n https://github.com/Visual-AI/hilo/"},{"id":"http://arxiv.org/abs/2410.09400v2","updated":"2025-03-03T12:33:49Z","published":"2024-10-12T07:04:32Z","title":"CtrLoRA: An Extensible and Efficient Framework for Controllable Image\n Generation","summary":" Recently, large-scale diffusion models have made impressive progress in\ntext-to-image (T2I) generation. To further equip these T2I models with\nfine-grained spatial control, approaches like ControlNet introduce an extra\nnetwork that learns to follow a condition image. However, for every single\ncondition type, ControlNet requires independent training on millions of data\npairs with hundreds of GPU hours, which is quite expensive and makes it\nchallenging for ordinary users to explore and develop new types of conditions.\nTo address this problem, we propose the CtrLoRA framework, which trains a Base\nControlNet to learn the common knowledge of image-to-image generation from\nmultiple base conditions, along with condition-specific LoRAs to capture\ndistinct characteristics of each condition. Utilizing our pretrained Base\nControlNet, users can easily adapt it to new conditions, requiring as few as\n1,000 data pairs and less than one hour of single-GPU training to obtain\nsatisfactory results in most scenarios. Moreover, our CtrLoRA reduces the\nlearnable parameters by 90% compared to ControlNet, significantly lowering the\nthreshold to distribute and deploy the model weights. Extensive experiments on\nvarious types of conditions demonstrate the efficiency and effectiveness of our\nmethod. 
Codes and model weights will be released at\nhttps://github.com/xyfJASON/ctrlora.\n","authors":["Yifeng Xu","Zhenliang He","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2410.09400v2.pdf","comment":"ICLR 2025. Code: https://github.com/xyfJASON/ctrlora"},{"id":"http://arxiv.org/abs/2410.08190v2","updated":"2025-03-03T12:18:29Z","published":"2024-10-10T17:57:29Z","title":"Poison-splat: Computation Cost Attack on 3D Gaussian Splatting","summary":" 3D Gaussian splatting (3DGS), known for its groundbreaking performance and\nefficiency, has become a dominant 3D representation and brought progress to\nmany 3D vision tasks. However, in this work, we reveal a significant security\nvulnerability that has been largely overlooked in 3DGS: the computation cost of\ntraining 3DGS could be maliciously tampered by poisoning the input data. By\ndeveloping an attack named Poison-splat, we reveal a novel attack surface where\nthe adversary can poison the input images to drastically increase the\ncomputation memory and time needed for 3DGS training, pushing the algorithm\ntowards its worst computation complexity. In extreme cases, the attack can even\nconsume all allocable memory, leading to a Denial-of-Service (DoS) that\ndisrupts servers, resulting in practical damages to real-world 3DGS service\nvendors. Such a computation cost attack is achieved by addressing a bi-level\noptimization problem through three tailored strategies: attack objective\napproximation, proxy model rendering, and optional constrained optimization.\nThese strategies not only ensure the effectiveness of our attack but also make\nit difficult to defend with simple defensive measures. We hope the revelation\nof this novel attack surface can spark attention to this crucial yet overlooked\nvulnerability of 3DGS systems. 
Our code is available at\nhttps://github.com/jiahaolu97/poison-splat .\n","authors":["Jiahao Lu","Yifan Zhang","Qiuhong Shen","Xinchao Wang","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2410.08190v2.pdf","comment":"Accepted by ICLR 2025 as a spotlight paper"},{"id":"http://arxiv.org/abs/2502.12138v3","updated":"2025-03-03T12:09:29Z","published":"2025-02-17T18:54:05Z","title":"FLARE: Feed-forward Geometry, Appearance and Camera Estimation from\n Uncalibrated Sparse Views","summary":" We present FLARE, a feed-forward model designed to infer high-quality camera\nposes and 3D geometry from uncalibrated sparse-view images (i.e., as few as 2-8\ninputs), which is a challenging yet practical setting in real-world\napplications. Our solution features a cascaded learning paradigm with camera\npose serving as the critical bridge, recognizing its essential role in mapping\n3D structures onto 2D image planes. Concretely, FLARE starts with camera pose\nestimation, whose results condition the subsequent learning of geometric\nstructure and appearance, optimized through the objectives of geometry\nreconstruction and novel-view synthesis. Utilizing large-scale public datasets\nfor training, our method delivers state-of-the-art performance in the tasks of\npose estimation, geometry reconstruction, and novel view synthesis, while\nmaintaining the inference efficiency (i.e., less than 0.5 seconds). The project\npage and code can be found at: https://zhanghe3z.github.io/FLARE/\n","authors":["Shangzhan Zhang","Jianyuan Wang","Yinghao Xu","Nan Xue","Christian Rupprecht","Xiaowei Zhou","Yujun Shen","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2502.12138v3.pdf","comment":"CVPR 2025. 
Website: https://zhanghe3z.github.io/FLARE/"},{"id":"http://arxiv.org/abs/2502.17941v2","updated":"2025-03-03T12:00:57Z","published":"2025-02-25T08:03:04Z","title":"Optimal Brain Apoptosis","summary":" The increasing complexity and parameter count of Convolutional Neural\nNetworks (CNNs) and Transformers pose challenges in terms of computational\nefficiency and resource demands. Pruning has been identified as an effective\nstrategy to address these challenges by removing redundant elements such as\nneurons, channels, or connections, thereby enhancing computational efficiency\nwithout heavily compromising performance. This paper builds on the foundational\nwork of Optimal Brain Damage (OBD) by advancing the methodology of parameter\nimportance estimation using the Hessian matrix. Unlike previous approaches that\nrely on approximations, we introduce Optimal Brain Apoptosis (OBA), a novel\npruning method that calculates the Hessian-vector product value directly for\neach parameter. By decomposing the Hessian matrix across network layers and\nidentifying conditions under which inter-layer Hessian submatrices are\nnon-zero, we propose a highly efficient technique for computing the\nsecond-order Taylor expansion of parameters. This approach allows for a more\nprecise pruning process, particularly in the context of CNNs and Transformers,\nas validated in our experiments including VGG19, ResNet32, ResNet50, and\nViT-B/16 on CIFAR10, CIFAR100 and Imagenet datasets. 
Our code is available at\nhttps://github.com/NEU-REAL/OBA.\n","authors":["Mingyuan Sun","Zheng Fang","Jiaxu Wang","Junjie Jiang","Delei Kong","Chenming Hu","Yuetong Fang","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2502.17941v2.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2407.15589v5","updated":"2025-03-03T11:48:03Z","published":"2024-07-22T12:26:08Z","title":"Exploring the Effectiveness of Object-Centric Representations in Visual\n Question Answering: Comparative Insights with Foundation Models","summary":" Object-centric (OC) representations, which model visual scenes as\ncompositions of discrete objects, have the potential to be used in various\ndownstream tasks to achieve systematic compositional generalization and\nfacilitate reasoning. However, these claims have yet to be thoroughly validated\nempirically. Recently, foundation models have demonstrated unparalleled\ncapabilities across diverse domains, from language to computer vision,\npositioning them as a potential cornerstone of future research for a wide range\nof computational tasks. In this paper, we conduct an extensive empirical study\non representation learning for downstream Visual Question Answering (VQA),\nwhich requires an accurate compositional understanding of the scene. We\nthoroughly investigate the benefits and trade-offs of OC models and alternative\napproaches including large pre-trained foundation models on both synthetic and\nreal-world data, ultimately identifying a promising path to leverage the\nstrengths of both paradigms. 
The extensiveness of our study, encompassing over\n600 downstream VQA models and 15 different types of upstream representations,\nalso provides several additional insights that we believe will be of interest\nto the community at large.\n","authors":["Amir Mohammad Karimi Mamaghan","Samuele Papa","Karl Henrik Johansson","Stefan Bauer","Andrea Dittadi"],"pdf_url":"https://arxiv.org/pdf/2407.15589v5.pdf","comment":"Published at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.21291v2","updated":"2025-03-03T11:33:31Z","published":"2025-02-28T18:21:08Z","title":"MIGE: A Unified Framework for Multimodal Instruction-Based Image\n Generation and Editing","summary":" Despite significant progress in diffusion-based image generation,\nsubject-driven generation and instruction-based editing remain challenging.\nExisting methods typically treat them separately, struggling with limited\nhigh-quality data and poor generalization. However, both tasks require\ncapturing complex visual variations while maintaining consistency between\ninputs and outputs. Therefore, we propose MIGE, a unified framework that\nstandardizes task representations using multimodal instructions. It treats\nsubject-driven generation as creation on a blank canvas and instruction-based\nediting as modification of an existing image, establishing a shared\ninput-output formulation. MIGE introduces a novel multimodal encoder that maps\nfree-form multimodal instructions into a unified vision-language space,\nintegrating visual and semantic features through a feature fusion mechanism.\nThis unification enables joint training of both tasks, providing two key\nadvantages: (1) Cross-Task Enhancement: By leveraging shared visual and\nsemantic representations, joint training improves instruction adherence and\nvisual consistency in both subject-driven generation and instruction-based\nediting. 
(2) Generalization: Learning in a unified format facilitates\ncross-task knowledge transfer, enabling MIGE to generalize to novel\ncompositional tasks, including instruction-based subject-driven editing.\nExperiments show that MIGE excels in both subject-driven generation and\ninstruction-based editing while setting a state-of-the-art in the new task of\ninstruction-based subject-driven editing. Code and model have been publicly\navailable at https://github.com/Eureka-Maggie/MIGE.\n","authors":["Xueyun Tian","Wei Li","Bingbing Xu","Yige Yuan","Yuanzhuo Wang","Huawei Shen"],"pdf_url":"https://arxiv.org/pdf/2502.21291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.18936v4","updated":"2025-03-03T11:00:24Z","published":"2025-01-31T07:41:06Z","title":"Adaptive Prompt: Unlocking the Power of Visual Prompt Tuning","summary":" Visual Prompt Tuning (VPT) has recently emerged as a powerful method for\nadapting pre-trained vision models to downstream tasks. By introducing\nlearnable prompt tokens as task-specific instructions, VPT effectively guides\npre-trained transformer models with minimal overhead. Despite its empirical\nsuccess, a comprehensive theoretical understanding of VPT remains an active\narea of research. Building on recent insights into the connection between\nmixture of experts and prompt-based approaches, we identify a key limitation in\nVPT: the restricted functional expressiveness in prompt formulation. To address\nthis limitation, we propose Visual Adaptive Prompt Tuning (VAPT), a new\ngeneration of prompts that redefines prompts as adaptive functions of the\ninput. Our theoretical analysis shows that this simple yet intuitive approach\nachieves optimal sample efficiency. Empirical results on VTAB-1K and FGVC\nfurther demonstrate VAPT's effectiveness, with performance gains of 7.34% and\n1.04% over fully fine-tuning baselines, respectively. Notably, VAPT also\nsurpasses VPT by a substantial margin while using fewer parameters. 
These\nresults highlight both the effectiveness and efficiency of our method and pave\nthe way for future research to explore the potential of adaptive prompts.\n","authors":["Minh Le","Anh Nguyen","Huy Nguyen","Chau Nguyen","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2501.18936v4.pdf","comment":"57 pages, 10 figures, 18 tables"},{"id":"http://arxiv.org/abs/2410.02423v2","updated":"2025-03-03T10:44:06Z","published":"2024-10-03T12:13:56Z","title":"PnP-Flow: Plug-and-Play Image Restoration with Flow Matching","summary":" In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm\nfor solving imaging inverse problems. PnP methods leverage the strength of\npre-trained denoisers, often deep neural networks, by integrating them in\noptimization schemes. While they achieve state-of-the-art performance on\nvarious inverse problems in imaging, PnP approaches face inherent limitations\non more generative tasks like inpainting. On the other hand, generative models\nsuch as Flow Matching pushed the boundary in image sampling yet lack a clear\nmethod for efficient use in image restoration. We propose to combine the PnP\nframework with Flow Matching (FM) by defining a time-dependent denoiser using a\npre-trained FM model. Our algorithm alternates between gradient descent steps\non the data-fidelity term, reprojections onto the learned FM path, and\ndenoising. Notably, our method is computationally efficient and\nmemory-friendly, as it avoids backpropagation through ODEs and trace\ncomputations. 
We evaluate its performance on denoising, super-resolution,\ndeblurring, and inpainting tasks, demonstrating superior results compared to\nexisting PnP algorithms and Flow Matching based state-of-the-art methods.\n","authors":["Ségolène Martin","Anne Gagneux","Paul Hagemann","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2410.02423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11542v2","updated":"2025-03-03T10:39:41Z","published":"2024-12-16T08:22:23Z","title":"Meta Curvature-Aware Minimization for Domain Generalization","summary":" Domain generalization (DG) aims to enhance the ability of models trained on\nsource domains to generalize effectively to unseen domains. Recently,\nSharpness-Aware Minimization (SAM) has shown promise in this area by reducing\nthe sharpness of the loss landscape to obtain more generalized models. However,\nSAM and its variants sometimes fail to guide the model toward a flat minimum,\nand their training processes exhibit limitations, hindering further\nimprovements in model generalization. In this paper, we first propose an\nimproved model training process aimed at encouraging the model to converge to a\nflat minima. To achieve this, we design a curvature metric that has a minimal\neffect when the model is far from convergence but becomes increasingly\ninfluential in indicating the curvature of the minima as the model approaches a\nlocal minimum. Then we derive a novel algorithm from this metric, called Meta\nCurvature-Aware Minimization (MeCAM), to minimize the curvature around the\nlocal minima. Specifically, the optimization objective of MeCAM simultaneously\nminimizes the regular training loss, the surrogate gap of SAM, and the\nsurrogate gap of meta-learning. 
We provide theoretical analysis on MeCAM's\ngeneralization error and convergence rate, and demonstrate its superiority over\nexisting DG methods through extensive experiments on five benchmark DG\ndatasets, including PACS, VLCS, OfficeHome, TerraIncognita, and DomainNet. Code\nwill be available on GitHub.\n","authors":["Ziyang Chen","Yiwen Ye","Feilong Tang","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2412.11542v2.pdf","comment":"22 pages, 5 figures, 17 tables"},{"id":"http://arxiv.org/abs/2502.08005v2","updated":"2025-03-03T10:38:34Z","published":"2025-02-11T23:02:14Z","title":"Towards Training One-Step Diffusion Models Without Distillation","summary":" Recent advances in one-step generative models typically follow a two-stage\nprocess: first training a teacher diffusion model and then distilling it into a\none-step student model. This distillation process traditionally relies on both\nthe teacher model's score function to compute the distillation loss and its\nweights for student initialization. In this paper, we explore whether one-step\ngenerative models can be trained directly without this distillation process.\nFirst, we show that the teacher's score function is not essential and propose a\nfamily of distillation methods that achieve competitive results without relying\non score estimation. Next, we demonstrate that initialization from teacher\nweights is indispensable in successful training. Surprisingly, we find that\nthis benefit is not due to improved ``input-output\" mapping but rather the\nlearned feature representations, which dominate distillation quality. 
Our\nfindings provide a better understanding of the role of initialization in\none-step model training and its impact on distillation quality.\n","authors":["Mingtian Zhang","Jiajun He","Wenlin Chen","Zijing Ou","José Miguel Hernández-Lobato","Bernhard Schölkopf","David Barber"],"pdf_url":"https://arxiv.org/pdf/2502.08005v2.pdf","comment":"13 pages, Technical Report"},{"id":"http://arxiv.org/abs/2502.21264v2","updated":"2025-03-03T10:35:23Z","published":"2025-02-28T17:40:45Z","title":"Foundation Models -- A Panacea for Artificial Intelligence in Pathology?","summary":" The role of artificial intelligence (AI) in pathology has evolved from aiding\ndiagnostics to uncovering predictive morphological patterns in whole slide\nimages (WSIs). Recently, foundation models (FMs) leveraging self-supervised\npre-training have been widely advocated as a universal solution for diverse\ndownstream tasks. However, open questions remain about their clinical\napplicability and generalization advantages over end-to-end learning using\ntask-specific (TS) models. Here, we focused on AI with clinical-grade\nperformance for prostate cancer diagnosis and Gleason grading. We present the\nlargest validation of AI for this task, using over 100,000 core needle biopsies\nfrom 7,342 patients across 15 sites in 11 countries. We compared two FMs with a\nfully end-to-end TS model in a multiple instance learning framework. Our\nfindings challenge assumptions that FMs universally outperform TS models. While\nFMs demonstrated utility in data-scarce scenarios, their performance converged\nwith - and was in some cases surpassed by - TS models when sufficient labeled\ntraining data were available. Notably, extensive task-specific training\nmarkedly reduced clinically significant misgrading, misdiagnosis of challenging\nmorphologies, and variability across different WSI scanners. Additionally, FMs\nused up to 35 times more energy than the TS model, raising concerns about their\nsustainability. 
Our results underscore that while FMs offer clear advantages\nfor rapid prototyping and research, their role as a universal solution for\nclinically applicable medical AI remains uncertain. For high-stakes clinical\napplications, rigorous validation and consideration of task-specific training\nremain critically important. We advocate for integrating the strengths of FMs\nand end-to-end learning to achieve robust and resource-efficient AI pathology\nsolutions fit for clinical use.\n","authors":["Nita Mulliqi","Anders Blilie","Xiaoyi Ji","Kelvin Szolnoky","Henrik Olsson","Sol Erika Boman","Matteo Titus","Geraldine Martinez Gonzalez","Julia Anna Mielcarz","Masi Valkonen","Einar Gudlaugsson","Svein R. Kjosavik","José Asenjo","Marcello Gambacorta","Paolo Libretti","Marcin Braun","Radzislaw Kordek","Roman Łowicki","Kristina Hotakainen","Päivi Väre","Bodil Ginnerup Pedersen","Karina Dalsgaard Sørensen","Benedicte Parm Ulhøi","Pekka Ruusuvuori","Brett Delahunt","Hemamali Samaratunga","Toyonori Tsuzuki","Emilius A. M. Janssen","Lars Egevad","Martin Eklund","Kimmo Kartasalo"],"pdf_url":"https://arxiv.org/pdf/2502.21264v2.pdf","comment":"50 pages, 15 figures and an appendix (study protocol) which is\n previously published, see https://doi.org/10.1101/2024.07.04.24309948;\n updated authors list format"},{"id":"http://arxiv.org/abs/2502.21201v2","updated":"2025-03-03T10:32:20Z","published":"2025-02-28T16:18:57Z","title":"The PanAf-FGBG Dataset: Understanding the Impact of Backgrounds in\n Wildlife Behaviour Recognition","summary":" Computer vision analysis of camera trap video footage is essential for\nwildlife conservation, as captured behaviours offer some of the earliest\nindicators of changes in population health. Recently, several high-impact\nanimal behaviour datasets and methods have been introduced to encourage their\nuse; however, the role of behaviour-correlated background information and its\nsignificant effect on out-of-distribution generalisation remain unexplored. 
In\nresponse, we present the PanAf-FGBG dataset, featuring 20 hours of wild\nchimpanzee behaviours, recorded at over 350 individual camera locations.\nUniquely, it pairs every video with a chimpanzee (referred to as a foreground\nvideo) with a corresponding background video (with no chimpanzee) from the same\ncamera location. We present two views of the dataset: one with overlapping\ncamera locations and one with disjoint locations. This setup enables, for the\nfirst time, direct evaluation of in-distribution and out-of-distribution\nconditions, and for the impact of backgrounds on behaviour recognition models\nto be quantified. All clips come with rich behavioural annotations and metadata\nincluding unique camera IDs and detailed textual scene descriptions.\nAdditionally, we establish several baselines and present a highly effective\nlatent-space normalisation technique that boosts out-of-distribution\nperformance by +5.42% mAP for convolutional and +3.75% mAP for\ntransformer-based models. Finally, we provide an in-depth analysis on the role\nof backgrounds in out-of-distribution behaviour recognition, including the so\nfar unexplored impact of background durations (i.e., the count of background\nframes within foreground videos).\n","authors":["Otto Brookes","Maksim Kukushkin","Majid Mirmehdi","Colleen Stephens","Paula Dieguez","Thurston C. Hicks","Sorrel Jones","Kevin Lee","Maureen S. McCarthy","Amelia Meier","Emmanuelle Normand","Erin G. Wessling","Roman M. 
Wittig","Kevin Langergraber","Klaus Zuberbühler","Lukas Boesch","Thomas Schmid","Mimi Arandjelovic","Hjalmar Kühl","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2502.21201v2.pdf","comment":"Accepted at the IEEE / CVF Computer Vision and Pattern Recognition\n Conference 2025"},{"id":"http://arxiv.org/abs/2410.05643v3","updated":"2025-03-03T10:28:30Z","published":"2024-10-08T02:46:30Z","title":"TRACE: Temporal Grounding Video LLM via Causal Event Modeling","summary":" Video Temporal Grounding (VTG) is a crucial capability for video\nunderstanding models and plays a vital role in downstream tasks such as video\nbrowsing and editing. To effectively handle various tasks simultaneously and\nenable zero-shot prediction, there is a growing trend in employing video LLMs\nfor VTG tasks. However, current video LLM-based methods rely exclusively on\nnatural language generation, lacking the ability to model the clear structure\ninherent in videos, which restricts their effectiveness in tackling VTG tasks.\nTo address this issue, this paper first formally introduces causal event\nmodeling framework, which represents video LLM outputs as sequences of events,\nand predict the current event using previous events, video inputs, and textural\ninstructions. Each event consists of three components: timestamps, salient\nscores, and textual captions. We then propose a novel task-interleaved video\nLLM called TRACE to effectively implement the causal event modeling framework\nin practice. The TRACE process visual frames, timestamps, salient scores, and\ntext as distinct tasks, employing various encoders and decoding heads for each.\nTask tokens are arranged in an interleaved sequence according to the causal\nevent modeling framework's formulation. Extensive experiments on various VTG\ntasks and datasets demonstrate the superior performance of TRACE compared to\nstate-of-the-art video LLMs. 
Our model and code are available at\nhttps://github.com/gyxxyg/TRACE.\n","authors":["Yongxin Guo","Jingyu Liu","Mingda Li","Qingbin Liu","Xi Chen","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2410.05643v3.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2411.06916v2","updated":"2025-03-03T10:22:24Z","published":"2024-11-11T12:19:28Z","title":"Slowing Down Forgetting in Continual Learning","summary":" A common challenge in continual learning (CL) is catastrophic forgetting,\nwhere the performance on old tasks drops after new, additional tasks are\nlearned. In this paper, we propose a novel framework called ReCL to slow down\nforgetting in CL. Our framework exploits an implicit bias of gradient-based\nneural networks due to which these converge to margin maximization points. Such\nconvergence points allow us to reconstruct old data from previous tasks, which\nwe then combine with the current training data. Our framework is flexible and\ncan be applied on top of existing, state-of-the-art CL methods. We further\ndemonstrate the performance gain from our framework across a large series of\nexperiments, including two challenging CL scenarios (class incremental and\ndomain incremental learning), different datasets (MNIST, CIFAR10,\nTinyImagenet), and different network architectures. Across all experiments, we\nfind large performance gains through ReCL. 
To the best of our knowledge, our\nframework is the first to address catastrophic forgetting by leveraging models\nin CL as their own memory buffers.\n","authors":["Pascal Janetzky","Tobias Schlagenhauf","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2411.06916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14651v3","updated":"2025-03-03T09:31:01Z","published":"2024-07-19T20:05:10Z","title":"Improving Representation of High-frequency Components for Medical Visual\n Foundation Models","summary":" Foundation models have recently attracted significant attention for their\nimpressive generalizability across diverse downstream tasks. However, these\nmodels are demonstrated to exhibit great limitations in representing\nhigh-frequency components and fine-grained details. In many medical imaging\ntasks, the precise representation of such information is crucial due to the\ninherently intricate anatomical structures, sub-visual features, and complex\nboundaries involved. Consequently, the limited representation of prevalent\nfoundation models can result in significant performance degradation or even\nfailure in these tasks. To address these challenges, we propose a novel\npretraining strategy, named Frequency-advanced Representation Autoencoder\n(Frepa). Through high-frequency masking and low-frequency perturbation combined\nwith adversarial learning, Frepa encourages the encoder to effectively\nrepresent and preserve high-frequency components in the image embeddings.\nAdditionally, we introduce an innovative histogram-equalized image masking\nstrategy, extending the Masked Autoencoder approach beyond ViT to other\narchitectures such as Swin Transformer and convolutional networks. We develop\nFrepa across nine medical modalities and validate it on 32 downstream tasks for\nboth 2D images and 3D volume data. Without fine-tuning, Frepa can outperform\nother self-supervised pretraining methods and, in some cases, even surpasses\ntask-specific trained models. 
This improvement is particularly significant for\ntasks involving fine-grained details, such as achieving up to a +15% increase\nin DSC for retina vessel segmentation and a +7% increase in IoU for lung nodule\ndetection. Further experiments quantitatively reveal that Frepa enables\nsuperior high-frequency representations and preservation in the embeddings,\nunderscoring its potential for developing more generalized and universal\nmedical image foundation models.\n","authors":["Yuetan Chu","Yilan Zhang","Zhongyi Han","Changchun Yang","Longxi Zhou","Gongning Luo","Chao Huang","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2407.14651v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23751v2","updated":"2025-03-03T09:30:42Z","published":"2024-10-31T09:11:56Z","title":"EXACFS -- A CIL Method to mitigate Catastrophic Forgetting","summary":" Deep neural networks (DNNS) excel at learning from static datasets but\nstruggle with continual learning, where data arrives sequentially. Catastrophic\nforgetting, the phenomenon of forgetting previously learned knowledge, is a\nprimary challenge. This paper introduces EXponentially Averaged Class-wise\nFeature Significance (EXACFS) to mitigate this issue in the class incremental\nlearning (CIL) setting. By estimating the significance of model features for\neach learned class using loss gradients, gradually aging the significance\nthrough the incremental tasks and preserving the significant features through a\ndistillation loss, EXACFS effectively balances remembering old knowledge\n(stability) and learning new knowledge (plasticity). 
Extensive experiments on\nCIFAR-100 and ImageNet-100 demonstrate EXACFS's superior performance in\npreserving stability while acquiring plasticity.\n","authors":["S Balasubramanian","M Sai Subramaniam","Sai Sriram Talasu","Yedu Krishna P","Manepalli Pranav Phanindra Sai","Ravi Mukkamala","Darshan Gera"],"pdf_url":"https://arxiv.org/pdf/2410.23751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16751v3","updated":"2025-03-03T09:07:59Z","published":"2025-01-28T07:08:20Z","title":"HiBug2: Efficient and Interpretable Error Slice Discovery for\n Comprehensive Model Debugging","summary":" Despite the significant success of deep learning models in computer vision,\nthey often exhibit systematic failures on specific data subsets, known as error\nslices. Identifying and mitigating these error slices is crucial to enhancing\nmodel robustness and reliability in real-world scenarios. In this paper, we\nintroduce HiBug2, an automated framework for error slice discovery and model\nrepair. HiBug2 first generates task-specific visual attributes to highlight\ninstances prone to errors through an interpretable and structured process. It\nthen employs an efficient slice enumeration algorithm to systematically\nidentify error slices, overcoming the combinatorial challenges that arise\nduring slice exploration. Additionally, HiBug2 extends its capabilities by\npredicting error slices beyond the validation set, addressing a key limitation\nof prior approaches. 
Extensive experiments across multiple domains, including\nimage classification, pose estimation, and object detection - show that HiBug2\nnot only improves the coherence and precision of identified error slices but\nalso significantly enhances the model repair capabilities.\n","authors":["Muxi Chen","Chenchen Zhao","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2501.16751v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12275v2","updated":"2025-03-03T09:05:52Z","published":"2024-06-18T05:05:12Z","title":"VoCo-LLaMA: Towards Vision Compression with Large Language Models","summary":" Vision-Language Models (VLMs) have achieved remarkable success in various\nmulti-modal tasks, but they are often bottlenecked by the limited context\nwindow and high computational cost of processing high-resolution image inputs\nand videos. Vision compression can alleviate this problem by reducing the\nvision token count. Previous approaches compress vision tokens with external\nmodules and force LLMs to understand the compressed ones, leading to visual\ninformation loss. However, the LLMs' understanding paradigm of vision tokens is\nnot fully utilised in the compression learning process. We propose VoCo-LLaMA,\nthe first approach to compress vision tokens using LLMs. By introducing Vision\nCompression tokens during the vision instruction tuning phase and leveraging\nattention distillation, our method distill how LLMs comprehend vision tokens\ninto their processing of VoCo tokens. VoCo-LLaMA facilitates effective vision\ncompression and improves the computational efficiency during the inference\nstage. Specifically, our method achieves minimal performance loss with a\ncompression ratio of 576$\\times$, resulting in up to 94.8$\\%$ fewer FLOPs and\n69.6$\\%$ acceleration in inference time. 
Furthermore, through continuous\ntraining using time-series compressed token sequences of video frames,\nVoCo-LLaMA demonstrates the ability to understand temporal correlations,\noutperforming previous methods on popular video question-answering benchmarks.\nOur approach presents a promising way to unlock the full potential of VLMs'\ncontextual window, enabling more scalable multi-modal applications. The project\npage, along with the associated code, can be accessed via\nhttps://yxxxb.github.io/VoCo-LLaMA-page/.\n","authors":["Xubing Ye","Yukang Gan","Xiaoke Huang","Yixiao Ge","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2406.12275v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2502.21130v2","updated":"2025-03-03T08:39:54Z","published":"2025-02-28T15:10:07Z","title":"Fast and Accurate Gigapixel Pathological Image Classification with\n Hierarchical Distillation Multi-Instance Learning","summary":" Although multi-instance learning (MIL) has succeeded in pathological image\nclassification, it faces the challenge of high inference costs due to\nprocessing numerous patches from gigapixel whole slide images (WSIs). To\naddress this, we propose HDMIL, a hierarchical distillation multi-instance\nlearning framework that achieves fast and accurate classification by\neliminating irrelevant patches. HDMIL consists of two key components: the\ndynamic multi-instance network (DMIN) and the lightweight instance\npre-screening network (LIPN). DMIN operates on high-resolution WSIs, while LIPN\noperates on the corresponding low-resolution counterparts. During training,\nDMIN are trained for WSI classification while generating attention-score-based\nmasks that indicate irrelevant patches. These masks then guide the training of\nLIPN to predict the relevance of each low-resolution patch. 
During testing,\nLIPN first determines the useful regions within low-resolution WSIs, which\nindirectly enables us to eliminate irrelevant regions in high-resolution WSIs,\nthereby reducing inference time without causing performance degradation. In\naddition, we further design the first Chebyshev-polynomials-based\nKolmogorov-Arnold classifier in computational pathology, which enhances the\nperformance of HDMIL through learnable activation layers. Extensive experiments\non three public datasets demonstrate that HDMIL outperforms previous\nstate-of-the-art methods, e.g., achieving improvements of 3.13% in AUC while\nreducing inference time by 28.6% on the Camelyon16 dataset.\n","authors":["Jiuyang Dong","Junjun Jiang","Kui Jiang","Jiahan Li","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2502.21130v2.pdf","comment":"11 pages, 4 figures, accepted by CVPR2025"},{"id":"http://arxiv.org/abs/2311.14922v3","updated":"2025-03-03T07:41:00Z","published":"2023-11-25T03:55:06Z","title":"GDTS: Goal-Guided Diffusion Model with Tree Sampling for Multi-Modal\n Pedestrian Trajectory Prediction","summary":" Accurate prediction of pedestrian trajectories is crucial for improving the\nsafety of autonomous driving. However, this task is generally nontrivial due to\nthe inherent stochasticity of human motion, which naturally requires the\npredictor to generate multi-modal prediction. Previous works leverage various\ngenerative methods, such as GAN and VAE, for pedestrian trajectory prediction.\nNevertheless, these methods may suffer from mode collapse and relatively\nlow-quality results. The denoising diffusion probabilistic model (DDPM) has\nrecently been applied to trajectory prediction due to its simple training\nprocess and powerful reconstruction ability. However, current diffusion-based\nmethods do not fully utilize input information and usually require many\ndenoising iterations that lead to a long inference time or an additional\nnetwork for initialization. 
To address these challenges and facilitate the use\nof diffusion models in multi-modal trajectory prediction, we propose GDTS, a\nnovel Goal-Guided Diffusion Model with Tree Sampling for multi-modal trajectory\nprediction. Considering the \"goal-driven\" characteristics of human motion, GDTS\nleverages goal estimation to guide the generation of the diffusion network. A\ntwo-stage tree sampling algorithm is presented, which leverages common features\nto reduce the inference time and improve accuracy for multi-modal prediction.\nExperimental results demonstrate that our proposed framework achieves\ncomparable state-of-the-art performance with real-time inference speed in\npublic datasets.\n","authors":["Ge Sun","Sheng Wang","Lei Zhu","Ming Liu","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2311.14922v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01217v2","updated":"2025-03-03T07:38:09Z","published":"2024-05-02T11:58:06Z","title":"CromSS: Cross-modal pre-training with noisy labels for remote sensing\n image segmentation","summary":" We explore the potential of large-scale noisily labeled data to enhance\nfeature learning by pretraining semantic segmentation models within a\nmulti-modal framework for geospatial applications. We propose a novel\nCross-modal Sample Selection (CromSS) method, a weakly supervised pretraining\nstrategy designed to improve feature representations through cross-modal\nconsistency and noise mitigation techniques. Unlike conventional pretraining\napproaches, CromSS exploits massive amounts of noisy and easy-to-come-by labels\nfor improved feature learning beneficial to semantic segmentation tasks. We\ninvestigate middle and late fusion strategies to optimize the multi-modal\npretraining architecture design. 
We also introduce a cross-modal sample\nselection module to mitigate the adverse effects of label noise, which employs\na cross-modal entangling strategy to refine the estimated confidence masks\nwithin each modality to guide the sampling process. Additionally, we introduce\na spatial-temporal label smoothing technique to counteract overconfidence for\nenhanced robustness against noisy labels. To validate our approach, we\nassembled the multi-modal dataset, NoLDO-S12, which consists of a large-scale\nnoisy label subset from Google's Dynamic World (DW) dataset for pretraining and\ntwo downstream subsets with high-quality labels from Google DW and\nOpenStreetMap (OSM) for transfer learning. Experimental results on two\ndownstream tasks and the publicly available DFC2020 dataset demonstrate that\nwhen effectively utilized, the low-cost noisy labels can significantly enhance\nfeature learning for segmentation tasks. All data, code, and pretrained weights\nwill be made publicly available.\n","authors":["Chenying Liu","Conrad Albrecht","Yi Wang","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.01217v2.pdf","comment":"The 1st short version was accepted as an oral presentation by ICLR\n 2024 ML4RS workshop. The 2nd extended version is being under review"},{"id":"http://arxiv.org/abs/2501.15394v2","updated":"2025-03-03T07:30:55Z","published":"2025-01-26T04:24:07Z","title":"Doracamom: Joint 3D Detection and Occupancy Prediction with Multi-view\n 4D Radars and Cameras for Omnidirectional Perception","summary":" 3D object detection and occupancy prediction are critical tasks in autonomous\ndriving, attracting significant attention. Despite the potential of recent\nvision-based methods, they encounter challenges under adverse conditions. Thus,\nintegrating cameras with next-generation 4D imaging radar to achieve unified\nmulti-task perception is highly significant, though research in this domain\nremains limited. 
In this paper, we propose Doracamom, the first framework that\nfuses multi-view cameras and 4D radar for joint 3D object detection and\nsemantic occupancy prediction, enabling comprehensive environmental perception.\nSpecifically, we introduce a novel Coarse Voxel Queries Generator that\nintegrates geometric priors from 4D radar with semantic features from images to\ninitialize voxel queries, establishing a robust foundation for subsequent\nTransformer-based refinement. To leverage temporal information, we design a\nDual-Branch Temporal Encoder that processes multi-modal temporal features in\nparallel across BEV and voxel spaces, enabling comprehensive spatio-temporal\nrepresentation learning. Furthermore, we propose a Cross-Modal BEV-Voxel Fusion\nmodule that adaptively fuses complementary features through attention\nmechanisms while employing auxiliary tasks to enhance feature quality.\nExtensive experiments on the OmniHD-Scenes, View-of-Delft (VoD), and TJ4DRadSet\ndatasets demonstrate that Doracamom achieves state-of-the-art performance in\nboth tasks, establishing new benchmarks for multi-modal 3D perception. Code and\nmodels will be publicly available.\n","authors":["Lianqing Zheng","Jianan Liu","Runwei Guan","Long Yang","Shouyi Lu","Yuanzhe Li","Xiaokai Bai","Jie Bai","Zhixiong Ma","Hui-Liang Shen","Xichan Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.15394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19289v3","updated":"2025-03-03T07:18:14Z","published":"2024-11-28T17:41:33Z","title":"ADUGS-VINS: Generalized Visual-Inertial Odometry for Robust Navigation\n in Highly Dynamic and Complex Environments","summary":" Visual-inertial odometry (VIO) is widely used in various fields, such as\nrobots, drones, and autonomous vehicles. However, real-world scenes often\nfeature dynamic objects, compromising the accuracy of VIO. The diversity and\npartial occlusion of these objects present a tough challenge for existing\ndynamic VIO methods. 
To tackle this challenge, we introduce ADUGS-VINS, which\nintegrates an enhanced SORT algorithm along with a promptable foundation model\ninto VIO, thereby improving pose estimation accuracy in environments with\ndiverse dynamic objects and frequent occlusions. We evaluated our proposed\nmethod using multiple public datasets representing various scenes, as well as\nin a real-world scenario involving diverse dynamic objects. The experimental\nresults demonstrate that our proposed method performs impressively in multiple\nscenarios, outperforming other state-of-the-art methods. This highlights its\nremarkable generalization and adaptability in diverse dynamic environments,\nshowcasing its potential to handle various dynamic objects in practical\napplications.\n","authors":["Rui Zhou","Jingbin Liu","Junbin Xie","Jianyu Zhang","Yingze Hu","Jiele Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.19289v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05757v2","updated":"2025-03-03T07:07:28Z","published":"2025-01-10T07:19:41Z","title":"Locality-aware Gaussian Compression for Fast and High-quality Rendering","summary":" We present LocoGS, a locality-aware 3D Gaussian Splatting (3DGS) framework\nthat exploits the spatial coherence of 3D Gaussians for compact modeling of\nvolumetric scenes. To this end, we first analyze the local coherence of 3D\nGaussian attributes, and propose a novel locality-aware 3D Gaussian\nrepresentation that effectively encodes locally-coherent Gaussian attributes\nusing a neural field representation with a minimal storage requirement. On top\nof the novel representation, LocoGS is carefully designed with additional\ncomponents such as dense initialization, an adaptive spherical harmonics\nbandwidth scheme and different encoding schemes for different Gaussian\nattributes to maximize compression performance. 
Experimental results\ndemonstrate that our approach outperforms the rendering quality of existing\ncompact Gaussian representations for representative real-world 3D datasets\nwhile achieving from 54.6$\\times$ to 96.6$\\times$ compressed storage size and\nfrom 2.1$\\times$ to 2.4$\\times$ rendering speed than 3DGS. Even our approach\nalso demonstrates an averaged 2.4$\\times$ higher rendering speed than the\nstate-of-the-art compression method with comparable compression performance.\n","authors":["Seungjoo Shin","Jaesik Park","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2501.05757v2.pdf","comment":"Accepted to ICLR 2025. Project page:\n https://seungjooshin.github.io/LocoGS"},{"id":"http://arxiv.org/abs/2501.12296v2","updated":"2025-03-03T06:45:12Z","published":"2025-01-21T17:03:06Z","title":"RALAD: Bridging the Real-to-Sim Domain Gap in Autonomous Driving with\n Retrieval-Augmented Learning","summary":" In the pursuit of robust autonomous driving systems, models trained on\nreal-world datasets often struggle to adapt to new environments, particularly\nwhen confronted with corner cases such as extreme weather conditions.\nCollecting these corner cases in the real world is non-trivial, which\nnecessitates the use of simulators for validation. However,the high\ncomputational cost and the domain gap in data distribution have hindered the\nseamless transition between real and simulated driving scenarios. To tackle\nthis challenge, we propose Retrieval-Augmented Learning for Autonomous Driving\n(RALAD), a novel framework designed to bridge the real-to-sim gap at a low\ncost. RALAD features three primary designs, including (1) domain adaptation via\nan enhanced Optimal Transport (OT) method that accounts for both individual and\ngrouped image distances, (2) a simple and unified framework that can be applied\nto various models, and (3) efficient fine-tuning techniques that freeze the\ncomputationally expensive layers while maintaining robustness. 
Experimental\nresults demonstrate that RALAD compensates for the performance degradation in\nsimulated environments while maintaining accuracy in real-world scenarios\nacross three different models. Taking Cross View as an example, the mIOU and\nmAP metrics in real-world scenarios remain stable before and after RALAD\nfine-tuning, while in simulated environments,the mIOU and mAP metrics are\nimproved by 10.30% and 12.29%, respectively. Moreover, the re-training cost of\nour approach is reduced by approximately 88.1%. Our code is available at\nhttps://github.com/JiachengZuo/RALAD.git.\n","authors":["Jiacheng Zuo","Haibo Hu","Zikang Zhou","Yufei Cui","Ziquan Liu","Jianping Wang","Nan Guan","Jin Wang","Chun Jason Xue"],"pdf_url":"https://arxiv.org/pdf/2501.12296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19160v2","updated":"2025-03-03T06:34:25Z","published":"2024-12-26T10:40:15Z","title":"Cross-Spectral Vision Transformer for Biometric Authentication using\n Forehead Subcutaneous Vein Pattern and Periocular Pattern","summary":" Traditional biometric systems have encountered significant setbacks due to\nvarious unavoidable factors, for example, face recognition-based biometrics\nfails due to the wearing of face masks and fingerprints create hygiene\nconcerns. This paper proposes a novel lightweight cross-spectral vision\ntransformer (CS-ViT) for biometric authentication using forehead subcutaneous\nvein patterns and periocular patterns, offering a promising alternative to\ntraditional methods, capable of performing well even with the face masks and\nwithout any physical touch. The proposed framework comprises a cross-spectral\ndual-channel architecture designed to handle two distinct biometric traits and\nto capture inter-dependencies in terms of relative spectral patterns. Each\nchannel consists of a Phase-Only Correlation Cross-Spectral Attention (POC-CSA)\nthat captures their individual as well as correlated patterns. 
The computation\nof cross-spectral attention using POC extracts the phase correlation in the\nspatial features. Therefore, it is robust against the resolution/intensity\nvariations and illumination of the input images, assuming both biometric traits\nare from the same person. The lightweight model is suitable for edge device\ndeployment. The performance of the proposed algorithm was rigorously evaluated\nusing the Forehead Subcutaneous Vein Pattern and Periocular Biometric Pattern\n(FSVP-PBP) database. The results demonstrated the superiority of the algorithm\nover state-of-the-art methods, achieving a remarkable classification accuracy\nof 98.8% with the combined vein and periocular patterns.\n","authors":["Arun K. Sharma","Shubhobrata Bhattacharya","Motahar Reza","Bishakh Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2412.19160v2.pdf","comment":"Submitted to IEEE TPAMI"},{"id":"http://arxiv.org/abs/2502.20041v2","updated":"2025-03-03T06:21:57Z","published":"2025-02-27T12:29:44Z","title":"3D-AffordanceLLM: Harnessing Large Language Models for Open-Vocabulary\n Affordance Detection in 3D Worlds","summary":" 3D Affordance detection is a challenging problem with broad applications on\nvarious robotic tasks. Existing methods typically formulate the detection\nparadigm as a label-based semantic segmentation task. This paradigm relies on\npredefined labels and lacks the ability to comprehend complex natural language,\nresulting in limited generalization in open-world scene. To address these\nlimitations, we reformulate the traditional affordance detection paradigm into\n\\textit{Instruction Reasoning Affordance Segmentation} (IRAS) task. This task\nis designed to output a affordance mask region given a query reasoning text,\nwhich avoids fixed categories of input labels. We accordingly propose the\n\\textit{3D-AffordanceLLM} (3D-ADLLM), a framework designed for reasoning\naffordance detection in 3D open-scene. 
Specifically, 3D-ADLLM introduces large\nlanguage models (LLMs) to 3D affordance perception with a custom-designed\ndecoder for generating affordance masks, thus achieving open-world reasoning\naffordance detection. In addition, given the scarcity of 3D affordance datasets\nfor training large models, we seek to extract knowledge from general\nsegmentation data and transfer it to affordance detection. Thus, we propose a\nmulti-stage training strategy that begins with a novel pre-training task, i.e.,\n\\textit{Referring Object Part Segmentation}~(ROPS). This stage is designed to\nequip the model with general recognition and segmentation capabilities at the\nobject-part level. Then followed by fine-tuning with the IRAS task, 3D-ADLLM\nobtains the reasoning ability for affordance detection. In summary, 3D-ADLLM\nleverages the rich world knowledge and human-object interaction reasoning\nability of LLMs, achieving approximately an 8\\% improvement in mIoU on\nopen-vocabulary affordance detection tasks.\n","authors":["Hengshuo Chu","Xiang Deng","Qi Lv","Xiaoyang Chen","Yinchuan Li","Jianye Hao","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2502.20041v2.pdf","comment":"ICLR"},{"id":"http://arxiv.org/abs/2310.01405v4","updated":"2025-03-03T06:14:14Z","published":"2023-10-02T17:59:07Z","title":"Representation Engineering: A Top-Down Approach to AI Transparency","summary":" In this paper, we identify and characterize the emerging area of\nrepresentation engineering (RepE), an approach to enhancing the transparency of\nAI systems that draws on insights from cognitive neuroscience. RepE places\npopulation-level representations, rather than neurons or circuits, at the\ncenter of analysis, equipping us with novel methods for monitoring and\nmanipulating high-level cognitive phenomena in deep neural networks (DNNs). 
We\nprovide baselines and an initial analysis of RepE techniques, showing that they\noffer simple yet effective solutions for improving our understanding and\ncontrol of large language models. We showcase how these methods can provide\ntraction on a wide range of safety-relevant problems, including honesty,\nharmlessness, power-seeking, and more, demonstrating the promise of top-down\ntransparency research. We hope that this work catalyzes further exploration of\nRepE and fosters advancements in the transparency and safety of AI systems.\n","authors":["Andy Zou","Long Phan","Sarah Chen","James Campbell","Phillip Guo","Richard Ren","Alexander Pan","Xuwang Yin","Mantas Mazeika","Ann-Kathrin Dombrowski","Shashwat Goel","Nathaniel Li","Michael J. Byun","Zifan Wang","Alex Mallen","Steven Basart","Sanmi Koyejo","Dawn Song","Matt Fredrikson","J. Zico Kolter","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2310.01405v4.pdf","comment":"Code is available at\n https://github.com/andyzoujm/representation-engineering"},{"id":"http://arxiv.org/abs/2412.10831v2","updated":"2025-03-03T06:13:35Z","published":"2024-12-14T13:28:40Z","title":"Low-Biased General Annotated Dataset Generation","summary":" Pre-training backbone networks on a general annotated dataset (e.g.,\nImageNet) that comprises numerous manually collected images with category\nannotations has proven to be indispensable for enhancing the generalization\ncapacity of downstream visual tasks. However, those manually collected images\noften exhibit bias, which is non-transferable across either categories or\ndomains, thus causing the model's generalization capacity degeneration. To\nmitigate this problem, we present an low-biased general annotated dataset\ngeneration framework (lbGen). Instead of expensive manual collection, we aim at\ndirectly generating low-biased images with category annotations. 
To achieve\nthis goal, we propose to leverage the advantage of a multimodal foundation\nmodel (e.g., CLIP), in terms of aligning images in an low-biased semantic space\ndefined by language. Specifically, we develop a bi-level semantic alignment\nloss, which not only forces all generated images to be consistent with the\nsemantic distribution of all categories belonging to the target dataset in an\nadversarial learning manner, but also requires each generated image to match\nthe semantic description of its category name. In addition, we further cast an\nexisting image quality scoring model into a quality assurance loss to preserve\nthe quality of the generated image. By leveraging these two loss functions, we\ncan obtain an low-biased image generation model by simply fine-tuning a\npre-trained diffusion model using only all category names in the target dataset\nas input. Experimental results confirm that, compared with the manually labeled\ndataset or other synthetic datasets, the utilization of our generated\nlow-biased datasets leads to stable generalization capacity enhancement of\ndifferent backbone networks across various tasks, especially in tasks where the\nmanually labeled samples are scarce.\n","authors":["Dengyang Jiang","Haoyu Wang","Lei Zhang","Wei Wei","Guang Dai","Mengmeng Wang","Jingdong Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.10831v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.02268v3","updated":"2025-03-03T05:32:47Z","published":"2024-10-03T07:40:14Z","title":"Structural-Entropy-Based Sample Selection for Efficient and Effective\n Learning","summary":" Sample selection improves the efficiency and effectiveness of machine\nlearning models by providing informative and representative samples. Typically,\nsamples can be modeled as a sample graph, where nodes are samples and edges\nrepresent their similarities. 
Most existing methods are based on local\ninformation, such as the training difficulty of samples, thereby overlooking\nglobal information, such as connectivity patterns. This oversight can result in\nsuboptimal selection because global information is crucial for ensuring that\nthe selected samples well represent the structural properties of the graph. To\naddress this issue, we employ structural entropy to quantify global information\nand losslessly decompose it from the whole graph to individual nodes using the\nShapley value. Based on the decomposition, we present\n$\\textbf{S}$tructural-$\\textbf{E}$ntropy-based sample $\\textbf{S}$election\n($\\textbf{SES}$), a method that integrates both global and local information to\nselect informative and representative samples. SES begins by constructing a\n$k$NN-graph among samples based on their similarities. It then measures sample\nimportance by combining structural entropy (global metric) with training\ndifficulty (local metric). Finally, SES applies importance-biased blue noise\nsampling to select a set of diverse and representative samples. Comprehensive\nexperiments on three learning scenarios -- supervised learning, active\nlearning, and continual learning -- clearly demonstrate the effectiveness of\nour method.\n","authors":["Tianchi Xie","Jiangning Zhu","Guozu Ma","Minzhi Lin","Wei Chen","Weikai Yang","Shixia Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02268v3.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2404.12379v3","updated":"2025-03-03T05:31:09Z","published":"2024-04-18T17:58:16Z","title":"Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Dynamic\n Scenes","summary":" Modern 3D engines and graphics pipelines require mesh as a memory-efficient\nrepresentation, which allows efficient rendering, geometry processing, texture\nediting, and many other downstream operations. 
However, it is still highly\ndifficult to obtain high-quality mesh in terms of detailed structure and time\nconsistency from dynamic observations. To this end, we introduce Dynamic\nGaussians Mesh (DG-Mesh), a framework to reconstruct a high-fidelity and\ntime-consistent mesh from dynamic input. Our work leverages the recent\nadvancement in 3D Gaussian Splatting to construct the mesh sequence with\ntemporal consistency from dynamic observations. Building on top of this\nrepresentation, DG-Mesh recovers high-quality meshes from the Gaussian points\nand can track the mesh vertices over time, which enables applications such as\ntexture editing on dynamic objects. We introduce the Gaussian-Mesh Anchoring,\nwhich encourages evenly distributed Gaussians, resulting better mesh\nreconstruction through mesh-guided densification and pruning on the deformed\nGaussians. By applying cycle-consistent deformation between the canonical and\nthe deformed space, we can project the anchored Gaussian back to the canonical\nspace and optimize Gaussians across all time frames. During the evaluation on\ndifferent datasets, DG-Mesh provides significantly better mesh reconstruction\nand rendering than baselines. Project page: https://www.liuisabella.com/DG-Mesh\n","authors":["Isabella Liu","Hao Su","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12379v3.pdf","comment":"Project page: https://www.liuisabella.com/DG-Mesh"},{"id":"http://arxiv.org/abs/2410.09374v3","updated":"2025-03-03T05:31:05Z","published":"2024-10-12T05:35:27Z","title":"ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras","summary":" Event-based visual odometry is a specific branch of visual Simultaneous\nLocalization and Mapping (SLAM) techniques, which aims at solving tracking and\nmapping subproblems (typically in parallel), by exploiting the special working\nprinciples of neuromorphic (i.e., event-based) cameras. 
Due to the\nmotion-dependent nature of event data, explicit data association (i.e., feature\nmatching) under large-baseline view-point changes is difficult to establish,\nmaking direct methods a more rational choice. However, state-of-the-art direct\nmethods are limited by the high computational complexity of the mapping\nsub-problem and the degeneracy of camera pose tracking in certain degrees of\nfreedom (DoF) in rotation. In this paper, we tackle these issues by building an\nevent-based stereo visual-inertial odometry system on top of a direct pipeline.\nSpecifically, to speed up the mapping operation, we propose an efficient\nstrategy for sampling contour points according to the local dynamics of events.\nThe mapping performance is also improved in terms of structure completeness and\nlocal smoothness by merging the temporal stereo and static stereo results. To\ncircumvent the degeneracy of camera pose tracking in recovering the pitch and\nyaw components of general 6-DoF motion, we introduce IMU measurements as motion\npriors via pre-integration. To this end, a compact back-end is proposed for\ncontinuously updating the IMU bias and predicting the linear velocity, enabling\nan accurate motion prediction for camera pose tracking. The resulting system\nscales well with modern high-resolution event cameras and leads to better\nglobal positioning accuracy in large-scale outdoor environments. 
Extensive\nevaluations on five publicly available datasets featuring different resolutions\nand scenarios justify the superior performance of the proposed system against\nfive state-of-the-art methods.\n","authors":["Junkai Niu","Sheng Zhong","Xiuyuan Lu","Shaojie Shen","Guillermo Gallego","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.09374v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.10988v2","updated":"2025-03-03T05:26:25Z","published":"2025-02-16T04:18:41Z","title":"OMG: Opacity Matters in Material Modeling with Gaussian Splatting","summary":" Decomposing geometry, materials and lighting from a set of images, namely\ninverse rendering, has been a long-standing problem in computer vision and\ngraphics. Recent advances in neural rendering enable photo-realistic and\nplausible inverse rendering results. The emergence of 3D Gaussian Splatting has\nboosted it to the next level by showing real-time rendering potentials. An\nintuitive finding is that the models used for inverse rendering do not take\ninto account the dependency of opacity w.r.t. material properties, namely cross\nsection, as suggested by optics. Therefore, we develop a novel approach that\nadds this dependency to the modeling itself. Inspired by radiative transfer, we\naugment the opacity term by introducing a neural network that takes as input\nmaterial properties to provide modeling of cross section and a physically\ncorrect activation function. The gradients for material properties are\ntherefore not only from color but also from opacity, facilitating a constraint\nfor their optimization. Therefore, the proposed method incorporates more\naccurate physical properties compared to previous works. 
We implement our\nmethod into 3 different baselines that use Gaussian Splatting for inverse\nrendering and achieve significant improvements universally in terms of novel\nview synthesis and material modeling.\n","authors":["Silong Yong","Venkata Nagarjun Pudureddiyur Manivannan","Bernhard Kerbl","Zifu Wan","Simon Stepputtis","Katia Sycara","Yaqi Xie"],"pdf_url":"https://arxiv.org/pdf/2502.10988v2.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.01912v2","updated":"2025-03-03T05:25:43Z","published":"2025-02-04T01:05:12Z","title":"PATCH: a deep learning method to assess heterogeneity of artistic\n practice in historical paintings","summary":" The history of art has seen significant shifts in the manner in which\nartworks are created, making understanding of creative processes a central\nquestion in technical art history. In the Renaissance and Early Modern period,\npaintings were largely produced by master painters directing workshops of\napprentices who often contributed to projects. The masters varied significantly\nin artistic and managerial styles, meaning different combinations of artists\nand implements might be seen both between masters and within workshops or even\nindividual canvases. Information on how different workshops were managed and\nthe processes by which artworks were created remains elusive. 
Machine learning\nmethods have potential to unearth new information about artists' creative\nprocesses by extending the analysis of brushwork to a microscopic scale.\nAnalysis of workshop paintings, however, presents a challenge in that\ndocumentation of the artists and materials involved is sparse, meaning external\nexamples are not available to train networks to recognize their contributions.\nHere we present a novel machine learning approach we call pairwise assignment\ntraining for classifying heterogeneity (PATCH) that is capable of identifying\nindividual artistic practice regimes with no external training data, or \"ground\ntruth.\" The method achieves unsupervised results by supervised means, and\noutperforms both simple statistical procedures and unsupervised machine\nlearning methods. We apply this method to two historical paintings by the\nSpanish Renaissance master, El Greco: The Baptism of Christ and Christ on the\nCross with Landscape, and our findings regarding the former potentially\nchallenge previous work that has assigned the painting to workshop members.\nFurther, the results of our analyses create a measure of heterogeneity of\nartistic practice that can be used to characterize artworks across time and\nspace.\n","authors":["Andrew Van Horn","Lauryn Smith","Mahamad Mahmoud","Michael McMaster","Clara Pinchbeck","Ina Martin","Andrew Lininger","Anthony Ingrisano","Adam Lowe","Carlos Bayod","Elizabeth Bolman","Kenneth Singer","Michael Hinczewski"],"pdf_url":"https://arxiv.org/pdf/2502.01912v2.pdf","comment":"main text: 16 pages, 6 figures; SI: 7 pages, 3 figures; v2: minor\n typo corrections, higher resolution figures"},{"id":"http://arxiv.org/abs/2402.02112v5","updated":"2025-03-03T04:42:15Z","published":"2024-02-03T10:35:42Z","title":"S-NeRF++: Autonomous Driving Simulation via Neural Reconstruction and\n Generation","summary":" Autonomous driving simulation system plays a crucial role in enhancing\nself-driving data and simulating complex and 
rare traffic scenarios, ensuring\nnavigation safety. However, traditional simulation systems, which often heavily\nrely on manual modeling and 2D image editing, struggled with scaling to\nextensive scenes and generating realistic simulation data. In this study, we\npresent S-NeRF++, an innovative autonomous driving simulation system based on\nneural reconstruction. Trained on widely-used self-driving datasets such as\nnuScenes and Waymo, S-NeRF++ can generate a large number of realistic street\nscenes and foreground objects with high rendering quality as well as offering\nconsiderable flexibility in manipulation and simulation. Specifically, S-NeRF++\nis an enhanced neural radiance field for synthesizing large-scale scenes and\nmoving vehicles, with improved scene parameterization and camera pose learning.\nThe system effectively utilizes noisy and sparse LiDAR data to refine training\nand address depth outliers, ensuring high-quality reconstruction and novel-view\nrendering. It also provides a diverse foreground asset bank by reconstructing\nand generating different foreground vehicles to support comprehensive scenario\ncreation.Moreover, we have developed an advanced foreground-background fusion\npipeline that skillfully integrates illumination and shadow effects, further\nenhancing the realism of our simulations. 
With the high-quality simulated data\nprovided by our S-NeRF++, we found the perception methods enjoy performance\nboosts on several autonomous driving downstream tasks, further demonstrating\nour proposed simulator's effectiveness.\n","authors":["Yurui Chen","Junge Zhang","Ziyang Xie","Wenye Li","Feihu Zhang","Jiachen Lu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02112v5.pdf","comment":"IEEE TPAMI 2025"},{"id":"http://arxiv.org/abs/2409.07002v2","updated":"2025-03-03T04:32:29Z","published":"2024-09-11T04:30:45Z","title":"AdvLogo: Adversarial Patch Attack against Object Detectors based on\n Diffusion Models","summary":" With the rapid development of deep learning, object detectors have\ndemonstrated impressive performance; however, vulnerabilities still exist in\ncertain scenarios. Current research exploring the vulnerabilities using\nadversarial patches often struggles to balance the trade-off between attack\neffectiveness and visual quality. To address this problem, we propose a novel\nframework of patch attack from semantic perspective, which we refer to as\nAdvLogo. Based on the hypothesis that every semantic space contains an\nadversarial subspace where images can cause detectors to fail in recognizing\nobjects, we leverage the semantic understanding of the diffusion denoising\nprocess and drive the process to adversarial subareas by perturbing the latent\nand unconditional embeddings at the last timestep. To mitigate the distribution\nshift that exposes a negative impact on image quality, we apply perturbation to\nthe latent in frequency domain with the Fourier Transform. 
Experimental results\ndemonstrate that AdvLogo achieves strong attack performance while maintaining\nhigh visual quality.\n","authors":["Boming Miao","Chunxiao Li","Yao Zhu","Weixiang Sun","Zizhe Wang","Xiaoyi Wang","Chuanlong Xie"],"pdf_url":"https://arxiv.org/pdf/2409.07002v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18084v2","updated":"2025-03-03T04:31:23Z","published":"2024-10-23T17:59:58Z","title":"DynamicCity: Large-Scale 4D Occupancy Generation from Dynamic Scenes","summary":" Urban scene generation has been developing rapidly recently. However,\nexisting methods primarily focus on generating static and single-frame scenes,\noverlooking the inherently dynamic nature of real-world driving environments.\nIn this work, we introduce DynamicCity, a novel 4D occupancy generation\nframework capable of generating large-scale, high-quality dynamic 4D scenes\nwith semantics. DynamicCity mainly consists of two key models. 1) A VAE model\nfor learning HexPlane as the compact 4D representation. Instead of using naive\naveraging operations, DynamicCity employs a novel Projection Module to\neffectively compress 4D features into six 2D feature maps for HexPlane\nconstruction, which significantly enhances HexPlane fitting quality (up to\n12.56 mIoU gain). Furthermore, we utilize an Expansion & Squeeze Strategy to\nreconstruct 3D feature volumes in parallel, which improves both network\ntraining efficiency and reconstruction accuracy than naively querying each 3D\npoint (up to 7.05 mIoU gain, 2.06x training speedup, and 70.84% memory\nreduction). 2) A DiT-based diffusion model for HexPlane generation. 
To make\nHexPlane feasible for DiT generation, a Padded Rollout Operation is proposed to\nreorganize all six feature planes of the HexPlane as a squared 2D feature map.\nIn particular, various conditions could be introduced in the diffusion or\nsampling process, supporting versatile 4D generation applications, such as\ntrajectory- and command-driven generation, inpainting, and layout-conditioned\ngeneration. Extensive experiments on the CarlaSC and Waymo datasets demonstrate\nthat DynamicCity significantly outperforms existing state-of-the-art 4D\noccupancy generation methods across multiple metrics. The code and models have\nbeen released to facilitate future research.\n","authors":["Hengwei Bian","Lingdong Kong","Haozhe Xie","Liang Pan","Yu Qiao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.18084v2.pdf","comment":"ICLR 2025 Spotlight; 35 pages, 18 figures, 15 tables; Project Page at\n https://dynamic-city.github.io/"},{"id":"http://arxiv.org/abs/2403.17010v3","updated":"2025-03-03T04:22:19Z","published":"2024-03-25T17:59:59Z","title":"Calib3D: Calibrating Model Preferences for Reliable 3D Scene\n Understanding","summary":" Safety-critical 3D scene understanding tasks necessitate not only accurate\nbut also confident predictions from 3D perception models. This study introduces\nCalib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D\nscene understanding models from an uncertainty estimation viewpoint. We\ncomprehensively evaluate 28 state-of-the-art models across 10 diverse 3D\ndatasets, uncovering insightful phenomena that cope with both the aleatoric and\nepistemic uncertainties in 3D scene understanding. We discover that despite\nachieving impressive levels of accuracy, existing models frequently fail to\nprovide reliable uncertainty estimates -- a pitfall that critically undermines\ntheir applicability in safety-sensitive contexts. 
Through extensive analysis of\nkey factors such as network capacity, LiDAR representations, rasterization\nresolutions, and 3D data augmentation techniques, we correlate these aspects\ndirectly with the model calibration efficacy. Furthermore, we introduce DeptS,\na novel depth-aware scaling approach aimed at enhancing 3D model calibration.\nExtensive experiments across a wide range of configurations validate the\nsuperiority of our method. We hope this work could serve as a cornerstone for\nfostering reliable 3D scene understanding. Code and benchmark toolkit are\npublicly available.\n","authors":["Lingdong Kong","Xiang Xu","Jun Cen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17010v3.pdf","comment":"WACV 2025 Oral; 26 pages, 8 figures, 12 tables; Code at\n https://github.com/ldkong1205/Calib3D"},{"id":"http://arxiv.org/abs/2410.03190v3","updated":"2025-03-03T04:11:46Z","published":"2024-10-04T07:05:16Z","title":"Tuning Timestep-Distilled Diffusion Model Using Pairwise Sample\n Optimization","summary":" Recent advancements in timestep-distilled diffusion models have enabled\nhigh-quality image generation that rivals non-distilled multi-step models, but\nwith significantly fewer inference steps. While such models are attractive for\napplications due to the low inference cost and latency, fine-tuning them with a\nnaive diffusion objective would result in degraded and blurry outputs. An\nintuitive alternative is to repeat the diffusion distillation process with a\nfine-tuned teacher model, which produces good results but is cumbersome and\ncomputationally intensive; the distillation training usually requires magnitude\nhigher of training compute compared to fine-tuning for specific image styles.\nIn this paper, we present an algorithm named pairwise sample optimization\n(PSO), which enables the direct fine-tuning of an arbitrary timestep-distilled\ndiffusion model. 
PSO introduces additional reference images sampled from the\ncurrent time-step distilled model, and increases the relative likelihood margin\nbetween the training images and reference images. This enables the model to\nretain its few-step generation ability, while allowing for fine-tuning of its\noutput distribution. We also demonstrate that PSO is a generalized formulation\nwhich can be flexibly extended to both offline-sampled and online-sampled\npairwise data, covering various popular objectives for diffusion model\npreference optimization. We evaluate PSO in both preference optimization and\nother fine-tuning tasks, including style transfer and concept customization. We\nshow that PSO can directly adapt distilled models to human-preferred generation\nwith both offline and online-generated pairwise preference image data. PSO also\ndemonstrates effectiveness in style transfer and concept customization by\ndirectly tuning timestep-distilled diffusion models.\n","authors":["Zichen Miao","Zhengyuan Yang","Kevin Lin","Ze Wang","Zicheng Liu","Lijuan Wang","Qiang Qiu"],"pdf_url":"https://arxiv.org/pdf/2410.03190v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21093v2","updated":"2025-03-03T03:48:47Z","published":"2025-02-28T14:32:04Z","title":"FlexDrive: Toward Trajectory Flexibility in Driving Scene Reconstruction\n and Rendering","summary":" Driving scene reconstruction and rendering have advanced significantly using\nthe 3D Gaussian Splatting. However, most prior research has focused on the\nrendering quality along a pre-recorded vehicle path and struggles to generalize\nto out-of-path viewpoints, which is caused by the lack of high-quality\nsupervision in those out-of-path views. To address this issue, we introduce an\nInverse View Warping technique to create compact and high-quality images as\nsupervision for the reconstruction of the out-of-path views, enabling\nhigh-quality rendering results for those views. 
For accurate and robust inverse\nview warping, a depth bootstrap strategy is proposed to obtain on-the-fly dense\ndepth maps during the optimization process, overcoming the sparsity and\nincompleteness of LiDAR depth data. Our method achieves superior in-path and\nout-of-path reconstruction and rendering performance on the widely used Waymo\nOpen dataset. In addition, a simulator-based benchmark is proposed to obtain\nthe out-of-path ground truth and quantitatively evaluate the performance of\nout-of-path rendering, where our method outperforms previous methods by a\nsignificant margin.\n","authors":["Jingqiu Zhou","Lue Fan","Linjiang Huang","Xiaoyu Shi","Si Liu","Zhaoxiang Zhang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2502.21093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02586v2","updated":"2025-03-03T03:38:29Z","published":"2023-05-04T06:40:11Z","title":"Semantically Structured Image Compression via Irregular Group-Based\n Decoupling","summary":" Image compression techniques typically focus on compressing rectangular\nimages for human consumption, however, resulting in transmitting redundant\ncontent for downstream applications. To overcome this limitation, some previous\nworks propose to semantically structure the bitstream, which can meet specific\napplication requirements by selective transmission and reconstruction.\nNevertheless, they divide the input image into multiple rectangular regions\naccording to semantics and ignore avoiding information interaction among them,\ncausing waste of bitrate and distorted reconstruction of region boundaries. In\nthis paper, we propose to decouple an image into multiple groups with irregular\nshapes based on a customized group mask and compress them independently. Our\ngroup mask describes the image at a finer granularity, enabling significant\nbitrate saving by reducing the transmission of redundant content. 
Moreover, to\nensure the fidelity of selective reconstruction, this paper proposes the\nconcept of group-independent transform that maintain the independence among\ndistinct groups. And we instantiate it by the proposed Group-Independent\nSwin-Block (GI Swin-Block). Experimental results demonstrate that our framework\nstructures the bitstream with negligible cost, and exhibits superior\nperformance on both visual quality and intelligent task supporting.\n","authors":["Ruoyu Feng","Yixin Gao","Xin Jin","Runsen Feng","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2305.02586v2.pdf","comment":"Accept by ICCV2023"},{"id":"http://arxiv.org/abs/2502.01117v2","updated":"2025-03-03T03:35:00Z","published":"2025-02-03T07:13:59Z","title":"Learning to Learn Weight Generation via Trajectory Diffusion","summary":" Diffusion-based algorithms have emerged as promising techniques for weight\ngeneration, particularly in scenarios like multi-task learning that require\nfrequent weight updates. However, existing solutions suffer from limited\ncross-task transferability. In addition, they only utilize optimal weights as\ntraining samples, ignoring the value of other weights in the optimization\nprocess. To address these issues, we propose Lt-Di, which integrates the\ndiffusion algorithm with meta-learning to generate weights for unseen tasks.\nFurthermore, we extend the vanilla diffusion algorithm into a trajectory\ndiffusion algorithm to utilize other weights along the optimization trajectory.\nTrajectory diffusion decomposes the entire diffusion chain into multiple\nshorter ones, improving training and inference efficiency. We analyze the\nconvergence properties of the weight generation paradigm and improve\nconvergence efficiency without additional time overhead. 
Our experiments\ndemonstrate Lt-Di's higher accuracy while reducing computational overhead\nacross various tasks, including zero-shot and few-shot learning, multi-domain\ngeneralization, and large-scale language model fine-tuning.Our code is released\nat https://anonymous.4open.science/r/Lt-Di-0E51.\n","authors":["Yunchuan Guan","Yu Liu","Ke Zhou","Zhiqi Shen","Serge Belongie","Jenq-Neng Hwang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2502.01117v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21256v2","updated":"2025-03-03T03:23:44Z","published":"2024-10-28T17:54:29Z","title":"Multi-modal AI for comprehensive breast cancer prognostication","summary":" Treatment selection in breast cancer is guided by molecular subtypes and\nclinical characteristics. However, current tools including genomic assays lack\nthe accuracy required for optimal clinical decision-making. We developed a\nnovel artificial intelligence (AI)-based approach that integrates digital\npathology images with clinical data, providing a more robust and effective\nmethod for predicting the risk of cancer recurrence in breast cancer patients.\nSpecifically, we utilized a vision transformer pan-cancer foundation model\ntrained with self-supervised learning to extract features from digitized\nH&E-stained slides. These features were integrated with clinical data to form a\nmulti-modal AI test predicting cancer recurrence and death. The test was\ndeveloped and evaluated using data from a total of 8,161 female breast cancer\npatients across 15 cohorts originating from seven countries. Of these, 3,502\npatients from five cohorts were used exclusively for evaluation, while the\nremaining patients were used for training. Our test accurately predicted our\nprimary endpoint, disease-free interval, in the five evaluation cohorts\n(C-index: 0.71 [0.68-0.75], HR: 3.63 [3.02-4.37, p<0.001]). 
In a direct\ncomparison (n=858), the AI test was more accurate than Oncotype DX, the\nstandard-of-care 21-gene assay, achieving a C-index of 0.67 [0.61-0.74] versus\n0.61 [0.49-0.73], respectively. Additionally, the AI test added independent\nprognostic information to Oncotype DX in a multivariate analysis (HR: 3.11\n[1.91-5.09, p<0.001)]). The test demonstrated robust accuracy across major\nmolecular breast cancer subtypes, including TNBC (C-index: 0.71 [0.62-0.81],\nHR: 3.81 [2.35-6.17, p=0.02]), where no diagnostic tools are currently\nrecommended by clinical guidelines. These results suggest that our AI test\nimproves upon the accuracy of existing prognostic tests, while being applicable\nto a wider range of patients.\n","authors":["Jan Witowski","Ken G. Zeng","Joseph Cappadona","Jailan Elayoubi","Khalil Choucair","Elena Diana Chiru","Nancy Chan","Young-Joon Kang","Frederick Howard","Irina Ostrovnaya","Carlos Fernandez-Granda","Freya Schnabel","Zoe Steinsnyder","Ugur Ozerdem","Kangning Liu","Waleed Abdulsattar","Yu Zong","Lina Daoud","Rafic Beydoun","Anas Saad","Nitya Thakore","Mohammad Sadic","Frank Yeung","Elisa Liu","Theodore Hill","Benjamin Swett","Danielle Rigau","Andrew Clayburn","Valerie Speirs","Marcus Vetter","Lina Sojak","Simone Soysal","Daniel Baumhoer","Jia-Wern Pan","Haslina Makmur","Soo-Hwang Teo","Linda Ma Pak","Victor Angel","Dovile Zilenaite-Petrulaitiene","Arvydas Laurinavicius","Natalie Klar","Brian D. Piening","Carlo Bifulco","Sun-Young Jun","Jae Pak Yi","Su Hyun Lim","Adam Brufsky","Francisco J. Esteva","Lajos Pusztai","Yann LeCun","Krzysztof J. 
Geras"],"pdf_url":"https://arxiv.org/pdf/2410.21256v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14093v3","updated":"2025-03-03T03:19:31Z","published":"2024-05-23T01:43:54Z","title":"A Survey on Vision-Language-Action Models for Embodied AI","summary":" Embodied AI is widely recognized as a key element of artificial general\nintelligence because it involves controlling embodied agents to perform tasks\nin the physical world. Building on the success of large language models and\nvision-language models, a new category of multimodal models -- referred to as\nvision-language-action models (VLAs) -- has emerged to address\nlanguage-conditioned robotic tasks in embodied AI by leveraging their distinct\nability to generate actions. In recent years, a myriad of VLAs have been\ndeveloped, making it imperative to capture the rapidly evolving landscape\nthrough a comprehensive survey. To this end, we present the first survey on\nVLAs for embodied AI. This work provides a detailed taxonomy of VLAs, organized\ninto three major lines of research. The first line focuses on individual\ncomponents of VLAs. The second line is dedicated to developing control policies\nadept at predicting low-level actions. The third line comprises high-level task\nplanners capable of decomposing long-horizon tasks into a sequence of subtasks,\nthereby guiding VLAs to follow more general user instructions. Furthermore, we\nprovide an extensive summary of relevant resources, including datasets,\nsimulators, and benchmarks. 
Finally, we discuss the challenges faced by VLAs\nand outline promising future directions in embodied AI.\n","authors":["Yueen Ma","Zixing Song","Yuzheng Zhuang","Jianye Hao","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2405.14093v3.pdf","comment":"16 pages, a survey of vision-language-action models"},{"id":"http://arxiv.org/abs/2501.12844v2","updated":"2025-03-03T03:18:40Z","published":"2025-01-22T12:45:09Z","title":"GAMED-Snake: Gradient-aware Adaptive Momentum Evolution Deep Snake Model\n for Multi-organ Segmentation","summary":" Multi-organ segmentation is a critical yet challenging task due to complex\nanatomical backgrounds, blurred boundaries, and diverse morphologies. This\nstudy introduces the Gradient-aware Adaptive Momentum Evolution Deep Snake\n(GAMED-Snake) model, which establishes a novel paradigm for contour-based\nsegmentation by integrating gradient-based learning with adaptive momentum\nevolution mechanisms. The GAMED-Snake model incorporates three major\ninnovations: First, the Distance Energy Map Prior (DEMP) generates a\npixel-level force field that effectively attracts contour points towards the\ntrue boundaries, even in scenarios with complex backgrounds and blurred edges.\nSecond, the Differential Convolution Inception Module (DCIM) precisely extracts\ncomprehensive energy gradients, significantly enhancing segmentation accuracy.\nThird, the Adaptive Momentum Evolution Mechanism (AMEM) employs cross-attention\nto establish dynamic features across different iterations of evolution,\nenabling precise boundary alignment for diverse morphologies. Experimental\nresults on four challenging multi-organ segmentation datasets demonstrate that\nGAMED-Snake improves the mDice metric by approximately 2% compared to\nstate-of-the-art methods. 
Code will be available at\nhttps://github.com/SYSUzrc/GAMED-Snake.\n","authors":["Ruicheng Zhang","Haowei Guo","Zeyu Zhang","Puxin Yan","Shen Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.16826v2","updated":"2025-03-03T03:09:49Z","published":"2025-02-24T04:23:21Z","title":"Noise2Score3D:Unsupervised Tweedie's Approach for Point Cloud Denoising","summary":" Building on recent advances in Bayesian statistics and image denoising, we\npropose Noise2Score3D, a fully unsupervised framework for point cloud denoising\nthat addresses the critical challenge of limited availability of clean data.\nNoise2Score3D learns the gradient of the underlying point cloud distribution\ndirectly from noisy data, eliminating the need for clean data during training.\nBy leveraging Tweedie's formula, our method performs inference in a single\nstep, avoiding the iterative processes used in existing unsupervised methods,\nthereby improving both performance and efficiency. Experimental results\ndemonstrate that Noise2Score3D achieves state-of-the-art performance on\nstandard benchmarks, outperforming other unsupervised methods in Chamfer\ndistance and point-to-mesh metrics, and rivaling some supervised approaches.\nFurthermore, Noise2Score3D demonstrates strong generalization ability beyond\ntraining datasets. 
Additionally, we introduce Total Variation for Point Cloud,\na criterion that allows for the estimation of unknown noise parameters, which\nfurther enhances the method's versatility and real-world utility.\n","authors":["Xiangbin Wei"],"pdf_url":"https://arxiv.org/pdf/2502.16826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13085v2","updated":"2025-03-03T03:08:28Z","published":"2024-10-16T23:03:27Z","title":"MMed-RAG: Versatile Multimodal RAG System for Medical Vision Language\n Models","summary":" Artificial Intelligence (AI) has demonstrated significant potential in\nhealthcare, particularly in disease diagnosis and treatment planning. Recent\nprogress in Medical Large Vision-Language Models (Med-LVLMs) has opened up new\npossibilities for interactive diagnostic tools. However, these models often\nsuffer from factual hallucination, which can lead to incorrect diagnoses.\nFine-tuning and retrieval-augmented generation (RAG) have emerged as methods to\naddress these issues. However, the amount of high-quality data and distribution\nshifts between training data and deployment data limit the application of\nfine-tuning methods. Although RAG is lightweight and effective, existing\nRAG-based approaches are not sufficiently general to different medical domains\nand can potentially cause misalignment issues, both between modalities and\nbetween the model and the ground truth. In this paper, we propose a versatile\nmultimodal RAG system, MMed-RAG, designed to enhance the factuality of\nMed-LVLMs. Our approach introduces a domain-aware retrieval mechanism, an\nadaptive retrieved contexts selection method, and a provable RAG-based\npreference fine-tuning strategy. These innovations make the RAG process\nsufficiently general and reliable, significantly improving alignment when\nintroducing retrieved contexts. 
Experimental results across five medical\ndatasets (involving radiology, ophthalmology, pathology) on medical VQA and\nreport generation demonstrate that MMed-RAG can achieve an average improvement\nof 43.8% in the factual accuracy of Med-LVLMs. Our data and code are available\nin https://github.com/richard-peng-xia/MMed-RAG.\n","authors":["Peng Xia","Kangyu Zhu","Haoran Li","Tianze Wang","Weijia Shi","Sheng Wang","Linjun Zhang","James Zou","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2410.13085v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2409.06214v3","updated":"2025-03-03T01:46:42Z","published":"2024-09-10T04:45:25Z","title":"Towards Generalizable Scene Change Detection","summary":" While current state-of-the-art Scene Change Detection (SCD) approaches\nachieve impressive results in well-trained research data, they become\nunreliable under unseen environments and different temporal conditions;\nin-domain performance drops from 77.6\\% to 8.0\\% in a previously unseen\nenvironment and to 4.6\\% under a different temporal condition -- calling for\ngeneralizable SCD and benchmark. In this work, we propose the Generalizable\nScene Change Detection Framework (GeSCF), which addresses unseen domain\nperformance and temporal consistency -- to meet the growing demand for anything\nSCD. Our method leverages the pre-trained Segment Anything Model (SAM) in a\nzero-shot manner. For this, we design Initial Pseudo-mask Generation and\nGeometric-Semantic Mask Matching -- seamlessly turning user-guided prompt and\nsingle-image based segmentation into scene change detection for a pair of\ninputs without guidance. Furthermore, we define the Generalizable Scene Change\nDetection (GeSCD) benchmark along with novel metrics and an evaluation protocol\nto facilitate SCD research in generalizability. 
In the process, we introduce\nthe ChangeVPR dataset, a collection of challenging image pairs with diverse\nenvironmental scenarios -- including urban, suburban, and rural settings.\nExtensive experiments across various datasets demonstrate that GeSCF achieves\nan average performance gain of 19.2\\% on existing SCD datasets and 30.0\\% on\nthe ChangeVPR dataset, nearly doubling the prior art performance. We believe\nour work can lay a solid foundation for robust and generalizable SCD research.\n","authors":["Jaewoo Kim","Uehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2409.06214v3.pdf","comment":"Manuscript. Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2502.08079v3","updated":"2025-03-03T01:35:58Z","published":"2025-02-12T02:53:27Z","title":"MAA: Meticulous Adversarial Attack against Vision-Language Pre-trained\n Models","summary":" Current adversarial attacks for evaluating the robustness of vision-language\npre-trained (VLP) models in multi-modal tasks suffer from limited\ntransferability, where attacks crafted for a specific model often struggle to\ngeneralize effectively across different models, limiting their utility in\nassessing robustness more broadly. This is mainly attributed to the\nover-reliance on model-specific features and regions, particularly in the image\nmodality. In this paper, we propose an elegant yet highly effective method\ntermed Meticulous Adversarial Attack (MAA) to fully exploit model-independent\ncharacteristics and vulnerabilities of individual samples, achieving enhanced\ngeneralizability and reduced model dependence. MAA emphasizes fine-grained\noptimization of adversarial images by developing a novel resizing and sliding\ncrop (RScrop) technique, incorporating a multi-granularity similarity\ndisruption (MGSD) strategy. 
Extensive experiments across diverse VLP models,\nmultiple benchmark datasets, and a variety of downstream tasks demonstrate that\nMAA significantly enhances the effectiveness and transferability of adversarial\nattacks. A large cohort of performance studies is conducted to generate\ninsights into the effectiveness of various model configurations, guiding future\nadvancements in this domain.\n","authors":["Peng-Fei Zhang","Guangdong Bai","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2502.08079v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14519v2","updated":"2025-03-03T00:51:41Z","published":"2024-09-22T16:25:31Z","title":"RobotFingerPrint: Unified Gripper Coordinate Space for Multi-Gripper\n Grasp Synthesis and Transfer","summary":" We introduce a novel grasp representation named the Unified Gripper\nCoordinate Space (UGCS) for grasp synthesis and grasp transfer. Our\nrepresentation leverages spherical coordinates to create a shared coordinate\nspace across different robot grippers, enabling it to synthesize and transfer\ngrasps for both novel objects and previously unseen grippers. The strength of\nthis representation lies in the ability to map palm and fingers of a gripper\nand the unified coordinate space. Grasp synthesis is formulated as predicting\nthe unified spherical coordinates on object surface points via a conditional\nvariational autoencoder. The predicted unified gripper coordinates establish\nexact correspondences between the gripper and object points, which is used to\noptimize grasp pose and joint values. Grasp transfer is facilitated through the\npoint-to-point correspondence between any two (potentially unseen) grippers and\nsolved via a similar optimization. Extensive simulation and real-world\nexperiments showcase the efficacy of the unified grasp representation for grasp\nsynthesis in generating stable and diverse grasps. 
Similarly, we showcase\nreal-world grasp transfer from human demonstrations across different objects.\n","authors":["Ninad Khargonkar","Luis Felipe Casas","Balakrishnan Prabhakaran","Yu Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.14519v2.pdf","comment":"8 pages, 11 figures, 3 tables. Project page available at\n https://irvlutd.github.io/RobotFingerPrint"},{"id":"http://arxiv.org/abs/2410.01417v2","updated":"2025-03-03T00:41:36Z","published":"2024-10-02T10:58:54Z","title":"The Labyrinth of Links: Navigating the Associative Maze of Multi-modal\n LLMs","summary":" Multi-modal Large Language Models (MLLMs) have exhibited impressive\ncapability. However, recently many deficiencies of MLLMs have been found\ncompared to human intelligence, $\\textit{e.g.}$, hallucination. To drive the\nMLLMs study, the community dedicated efforts to building larger benchmarks with\ncomplex tasks. In this paper, we propose benchmarking an essential but usually\noverlooked intelligence: $\\textbf{association}$, a human's basic capability to\nlink observation and prior practice memory. To comprehensively investigate\nMLLM's performance on the association, we formulate the association task and\ndevise a standard benchmark based on adjective and verb semantic concepts.\nInstead of costly data annotation and curation, we propose a convenient\n$\\textbf{annotation-free}$ construction method transforming the general dataset\nfor our association tasks. Simultaneously, we devise a rigorous data refinement\nprocess to eliminate confusion in the raw dataset. Building on this database,\nwe establish three levels of association tasks: single-step, synchronous, and\nasynchronous associations. Moreover, we conduct a comprehensive investigation\ninto the MLLMs' zero-shot association capabilities, addressing multiple\ndimensions, including three distinct memory strategies, both open-source and\nclosed-source MLLMs, cutting-edge Mixture-of-Experts (MoE) models, and the\ninvolvement of human experts. 
Our systematic investigation shows that current\nopen-source MLLMs consistently exhibit poor capability in our association\ntasks, even the currently state-of-the-art GPT-4V(vision) also has a\nsignificant gap compared to humans. We believe our benchmark would pave the way\nfor future MLLM studies. $\\textit{Our data and code are available at:}$\nhttps://mvig-rhos.com/llm_inception.\n","authors":["Hong Li","Nanxi Li","Yuanjie Chen","Jianbin Zhu","Qinlu Guo","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2410.01417v2.pdf","comment":"Accepted by ICLR 2025. Project page:\n https://mvig-rhos.com/llm_inception"},{"id":"http://arxiv.org/abs/2409.04607v2","updated":"2025-03-03T00:20:29Z","published":"2024-09-06T20:32:53Z","title":"Self-Supervised Contrastive Learning for Videos using Differentiable\n Local Alignment","summary":" Robust frame-wise embeddings are essential to perform video analysis and\nunderstanding tasks. We present a self-supervised method for representation\nlearning based on aligning temporal video sequences. Our framework uses a\ntransformer-based encoder to extract frame-level features and leverages them to\nfind the optimal alignment path between video sequences. We introduce the novel\nLocal-Alignment Contrastive (LAC) loss, which combines a differentiable local\nalignment loss to capture local temporal dependencies with a contrastive loss\nto enhance discriminative learning. Prior works on video alignment have focused\non using global temporal ordering across sequence pairs, whereas our loss\nencourages identifying the best-scoring subsequence alignment. LAC uses the\ndifferentiable Smith-Waterman (SW) affine method, which features a flexible\nparameterization learned through the training phase, enabling the model to\nadjust the temporal gap penalty length dynamically. 
Evaluations show that our\nlearned representations outperform existing state-of-the-art approaches on\naction recognition tasks.\n","authors":["Keyne Oei","Amr Gomaa","Anna Maria Feit","João Belo"],"pdf_url":"https://arxiv.org/pdf/2409.04607v2.pdf","comment":"Accepted in 2nd Workshop on Video Understanding and its Applications,\n held in conjunction with the British Machine Vision Conference (BMVC) 2024"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2502.12215v2","updated":"2025-03-03T15:29:43Z","published":"2025-02-17T07:21:11Z","title":"Revisiting the Test-Time Scaling of o1-like Models: Do they Truly\n Possess Test-Time Scaling Capabilities?","summary":" The advent of test-time scaling in large language models (LLMs), exemplified\nby OpenAI's o1 series, has advanced reasoning capabilities by scaling\ncomputational resource allocation during inference. While successors like QwQ,\nDeepseek-R1 (R1) and LIMO replicate these advancements, whether these models\ntruly possess test-time scaling capabilities remains underexplored. This study\nfound that longer CoTs of these o1-like models do not consistently enhance\naccuracy; in fact, correct solutions are often shorter than incorrect ones for\nthe same questions. Further investigation shows this phenomenon is closely\nrelated to models' self-revision capabilities - longer CoTs contain more\nself-revisions, which often lead to performance degradation. We then compare\nsequential and parallel scaling strategies on QwQ, R1 and LIMO, finding that\nparallel scaling achieves better coverage and scalability. 
Based on these\ninsights, we propose Shortest Majority Vote, a method that combines parallel\nscaling strategies with CoT length characteristics, significantly improving\nmodels' test-time scalability compared to conventional majority voting\napproaches.\n","authors":["Zhiyuan Zeng","Qinyuan Cheng","Zhangyue Yin","Yunhua Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2502.12215v2.pdf","comment":"Add the github link"},{"id":"http://arxiv.org/abs/2502.19723v2","updated":"2025-03-03T15:07:28Z","published":"2025-02-27T03:25:34Z","title":"CNsum:Automatic Summarization for Chinese News Text","summary":" Obtaining valuable information from massive data efficiently has become our\nresearch goal in the era of Big Data. Text summarization technology has been\ncontinuously developed to meet this demand. Recent work has also shown that\ntransformer-based pre-trained language models have achieved great success on\nvarious tasks in Natural Language Processing (NLP). Aiming at the problem of\nChinese news text summary generation and the application of Transformer\nstructure on Chinese, this paper proposes a Chinese news text summarization\nmodel (CNsum) based on Transformer structure, and tests it on Chinese datasets\nsuch as THUCNews. 
The results of the conducted experiments show that CNsum\nachieves better ROUGE score than the baseline models, which verifies the\noutperformance of the model.\n","authors":["Yu Zhao","Songping Huang","Dongsheng Zhou","Zhaoyun Ding","Fei Wang","Aixin Nian"],"pdf_url":"https://arxiv.org/pdf/2502.19723v2.pdf","comment":"This withdrawal is due to the lack of authorization from all\n co-authors for the publication of this version"},{"id":"http://arxiv.org/abs/2410.23208v2","updated":"2025-03-03T14:29:16Z","published":"2024-10-30T16:59:41Z","title":"Kinetix: Investigating the Training of General Agents through Open-Ended\n Physics-Based Control Tasks","summary":" While large models trained with self-supervised learning on offline datasets\nhave shown remarkable capabilities in text and image domains, achieving the\nsame generalisation for agents that act in sequential decision problems remains\nan open challenge. In this work, we take a step towards this goal by\nprocedurally generating tens of millions of 2D physics-based tasks and using\nthese to train a general reinforcement learning (RL) agent for physical\ncontrol. To this end, we introduce Kinetix: an open-ended space of\nphysics-based RL environments that can represent tasks ranging from robotic\nlocomotion and grasping to video games and classic RL environments, all within\na unified framework. Kinetix makes use of our novel hardware-accelerated\nphysics engine Jax2D that allows us to cheaply simulate billions of environment\nsteps during training. Our trained agent exhibits strong physical reasoning\ncapabilities in 2D space, being able to zero-shot solve unseen human-designed\nenvironments. Furthermore, fine-tuning this general agent on tasks of interest\nshows significantly stronger performance than training an RL agent *tabula\nrasa*. This includes solving some environments that standard RL training\ncompletely fails at. 
We believe this demonstrates the feasibility of large\nscale, mixed-quality pre-training for online RL and we hope that Kinetix will\nserve as a useful framework to investigate this further.\n","authors":["Michael Matthews","Michael Beukman","Chris Lu","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2410.23208v2.pdf","comment":"ICLR 2025 Oral. The first two authors contributed equally. Project\n page located at: https://kinetix-env.github.io/"},{"id":"http://arxiv.org/abs/2412.11948v2","updated":"2025-03-03T13:58:56Z","published":"2024-12-16T16:31:00Z","title":"OpenReviewer: A Specialized Large Language Model for Generating Critical\n Scientific Paper Reviews","summary":" We present OpenReviewer, an open-source system for generating high-quality\npeer reviews of machine learning and AI conference papers. At its core is\nLlama-OpenReviewer-8B, an 8B parameter language model specifically fine-tuned\non 79,000 expert reviews from top conferences. Given a PDF paper submission and\nreview template as input, OpenReviewer extracts the full text, including\ntechnical content like equations and tables, and generates a structured review\nfollowing conference-specific guidelines. Our evaluation on 400 test papers\nshows that OpenReviewer produces considerably more critical and realistic\nreviews compared to general-purpose LLMs like GPT-4 and Claude-3.5. While other\nLLMs tend toward overly positive assessments, OpenReviewer's recommendations\nclosely match the distribution of human reviewer ratings. 
The system provides\nauthors with rapid, constructive feedback to improve their manuscripts before\nsubmission, though it is not intended to replace human peer review.\nOpenReviewer is available as an online demo and open-source tool.\n","authors":["Maximilian Idahl","Zahra Ahmadi"],"pdf_url":"https://arxiv.org/pdf/2412.11948v2.pdf","comment":"Demo: https://huggingface.co/spaces/maxidl/openreviewer Model:\n https://huggingface.co/maxidl/Llama-OpenReviewer-8B To appear at NAACL 2025\n System Demonstrations Track"},{"id":"http://arxiv.org/abs/2411.04671v3","updated":"2025-03-03T13:41:33Z","published":"2024-11-07T12:55:17Z","title":"CUIfy the XR: An Open-Source Package to Embed LLM-powered Conversational\n Agents in XR","summary":" Recent developments in computer graphics, machine learning, and sensor\ntechnologies enable numerous opportunities for extended reality (XR) setups for\neveryday life, from skills training to entertainment. With large corporations\noffering affordable consumer-grade head-mounted displays (HMDs), XR will likely\nbecome pervasive, and HMDs will develop as personal devices like smartphones\nand tablets. However, having intelligent spaces and naturalistic interactions\nin XR is as important as technological advances so that users grow their\nengagement in virtual and augmented spaces. To this end, large language model\n(LLM)--powered non-player characters (NPCs) with speech-to-text (STT) and\ntext-to-speech (TTS) models bring significant advantages over conventional or\npre-scripted NPCs for facilitating more natural conversational user interfaces\n(CUIs) in XR. This paper provides the community with an open-source,\ncustomizable, extendable, and privacy-aware Unity package, CUIfy, that\nfacilitates speech-based NPC-user interaction with widely used LLMs, STT, and\nTTS models. 
Our package also supports multiple LLM-powered NPCs per environment\nand minimizes latency between different computational models through streaming\nto achieve usable interactions between users and NPCs. We publish our source\ncode in the following repository: https://gitlab.lrz.de/hctl/cuify\n","authors":["Kadir Burak Buldu","Süleyman Özdel","Ka Hei Carrie Lau","Mengdi Wang","Daniel Saad","Sofie Schönborn","Auxane Boch","Enkelejda Kasneci","Efe Bozkir"],"pdf_url":"https://arxiv.org/pdf/2411.04671v3.pdf","comment":"7th IEEE International Conference on Artificial Intelligence &\n eXtended and Virtual Reality (IEEE AIxVR 2025)"},{"id":"http://arxiv.org/abs/2502.18858v2","updated":"2025-03-03T13:38:50Z","published":"2025-02-26T05:59:45Z","title":"Evaluating Intelligence via Trial and Error","summary":" Intelligence is a crucial trait for species to find solutions within a\nlimited number of trial-and-error attempts. Building on this idea, we introduce\nSurvival Game as a framework to evaluate intelligence based on the number of\nfailed attempts in a trial-and-error process. Fewer failures indicate higher\nintelligence. When the expectation and variance of failure counts are both\nfinite, it signals the ability to consistently find solutions to new\nchallenges, which we define as the Autonomous Level of intelligence. Using\nSurvival Game, we comprehensively evaluate existing AI systems. Our results\nshow that while AI systems achieve the Autonomous Level in simple tasks, they\nare still far from it in more complex tasks, such as vision, search,\nrecommendation, and language. While scaling current AI technologies might help,\nthis would come at an astronomical cost. Projections suggest that achieving the\nAutonomous Level for general tasks would require $10^{26}$ parameters. 
To put\nthis into perspective, loading such a massive model requires so many H100 GPUs\nthat their total value is $10^{7}$ times that of Apple Inc.'s market value.\nEven with Moore's Law, supporting such a parameter scale would take $70$ years.\nThis staggering cost highlights the complexity of human tasks and the\ninadequacies of current AI technologies. To further investigate this\nphenomenon, we conduct a theoretical analysis of Survival Game and its\nexperimental results. Our findings suggest that human tasks possess a\ncriticality property. As a result, Autonomous Level requires a deep\nunderstanding of the task's underlying mechanisms. Current AI systems, however,\ndo not fully grasp these mechanisms and instead rely on superficial mimicry,\nmaking it difficult for them to reach an autonomous level. We believe Survival\nGame can not only guide the future development of AI but also offer profound\ninsights into human intelligence.\n","authors":["Jingtao Zhan","Jiahao Zhao","Jiayu Li","Yiqun Liu","Bo Zhang","Qingyao Ai","Jiaxin Mao","Hongning Wang","Min Zhang","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2502.18858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17711v2","updated":"2025-03-03T13:19:42Z","published":"2024-11-17T17:32:58Z","title":"AnyECG: Foundational Models for Multitask Cardiac Analysis in Real-World\n Settings","summary":" Electrocardiogram (ECG), a non-invasive and affordable tool for cardiac\nmonitoring, is highly sensitive in detecting acute heart attacks. However, due\nto the lengthy nature of ECG recordings, numerous machine learning methods have\nbeen developed for automated heart disease detection to reduce human workload.\nDespite these efforts, performance remains suboptimal. A key obstacle is the\ninherent complexity of ECG data, which includes heterogeneity (e.g., varying\nsampling rates), high levels of noise, demographic-related pattern shifts, and\nintricate rhythm-event associations. 
To overcome these challenges, this paper\nintroduces AnyECG, a foundational model designed to extract robust\nrepresentations from any real-world ECG data. Specifically, a tailored ECG\nTokenizer encodes each fixed-duration ECG fragment into a token and, guided by\nproxy tasks, converts noisy, continuous ECG features into discrete, compact,\nand clinically meaningful local rhythm codes. These codes encapsulate basic\nmorphological, frequency, and demographic information (e.g., sex), effectively\nmitigating signal noise. We further pre-train the AnyECG to learn rhythmic\npattern associations across ECG tokens, enabling the capture of cardiac event\nsemantics. By being jointly pre-trained on diverse ECG data sources, AnyECG is\ncapable of generalizing across a wide range of downstream tasks where ECG\nsignals are recorded from various devices and scenarios. The experimental\nresults show that AnyECG achieves an average performance improvement of 6%\nacross four critical tasks-anomaly detection, arrhythmia classification,\ncorrupted lead generation, and ultra-long ECG recognition. AnyECG learns common\nECG rhythm from data and significantly outperforms state-of-the-art methods in\neach of these tasks.\n","authors":["Yue Wang","Xu Cao","Yaojun Hu","Haochao Ying","Hongxia Xu","Ruijia Wu","James Matthew Rehg","Jimeng Sun","Jian Wu","Jintai Chen"],"pdf_url":"https://arxiv.org/pdf/2411.17711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07076v4","updated":"2025-03-03T13:17:24Z","published":"2024-10-09T17:19:58Z","title":"MOOSE-Chem: Large Language Models for Rediscovering Unseen Chemistry\n Scientific Hypotheses","summary":" Scientific discovery contributes largely to human society's prosperity, and\nrecent progress shows that LLMs could potentially catalyze this process.\nHowever, it is still unclear whether LLMs can discover novel and valid\nhypotheses in chemistry. 
In this work, we investigate this central research\nquestion: Can LLMs automatically discover novel and valid chemistry research\nhypotheses given only a chemistry research background (consisting of a research\nquestion and/or a background survey), without limitation on the domain of the\nresearch question? After extensive discussions with chemistry experts, we\npropose an assumption that a majority of chemistry hypotheses can be resulted\nfrom a research background and several inspirations. With this key insight, we\nbreak the central question into three smaller fundamental questions. In brief,\nthey are: (1) given a background question, whether LLMs can retrieve good\ninspirations; (2) with background and inspirations, whether LLMs can lead to\nhypothesis; and (3) whether LLMs can identify good hypotheses to rank them\nhigher. To investigate these questions, we construct a benchmark consisting of\n51 chemistry papers published in Nature, Science, or a similar level in 2024\n(all papers are only available online since 2024). Every paper is divided by\nchemistry PhD students into three components: background, inspirations, and\nhypothesis. The goal is to rediscover the hypothesis, given only the background\nand a large randomly selected chemistry literature corpus consisting the ground\ntruth inspiration papers, with LLMs trained with data up to 2023. We also\ndevelop an LLM-based multi-agent framework that leverages the assumption,\nconsisting of three stages reflecting the three smaller questions. 
The proposed\nmethod can rediscover many hypotheses with very high similarity with the ground\ntruth ones, covering the main innovations.\n","authors":["Zonglin Yang","Wanhao Liu","Ben Gao","Tong Xie","Yuqiang Li","Wanli Ouyang","Soujanya Poria","Erik Cambria","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.07076v4.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2501.09555v3","updated":"2025-03-03T13:05:35Z","published":"2025-01-16T14:18:06Z","title":"Text-driven Adaptation of Foundation Models for Few-shot Surgical\n Workflow Analysis","summary":" Purpose: Surgical workflow analysis is crucial for improving surgical\nefficiency and safety. However, previous studies rely heavily on large-scale\nannotated datasets, posing challenges in cost, scalability, and reliance on\nexpert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven\nAdaptation), designed to handle various surgical workflow analysis tasks with\nminimal paired image-label data.\n Methods: Our approach has two key components. First, Few-shot selection-based\nmodality alignment selects a small subset of images and aligns their embeddings\nwith text embeddings from the downstream task, bridging the modality gap.\nSecond, Text-driven adaptation leverages only text data to train a decoder,\neliminating the need for paired image-text data. This decoder is then applied\nto aligned image embeddings, enabling image-related tasks without explicit\nimage-text pairs.\n Results: We evaluate our approach to generative tasks (image captioning) and\ndiscriminative tasks (triplet recognition and phase recognition). Results show\nthat Surg-FTDA outperforms baselines and generalizes well across downstream\ntasks.\n Conclusion: We propose a text-driven adaptation approach that mitigates the\nmodality gap and handles multiple downstream tasks in surgical workflow\nanalysis, with minimal reliance on large annotated datasets. 
The code and\ndataset will be released in https://github.com/CAMMA-public/Surg-FTDA\n","authors":["Tingxuan Chen","Kun Yuan","Vinkle Srivastav","Nassir Navab","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2501.09555v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.11142v2","updated":"2025-03-03T12:56:35Z","published":"2025-02-16T14:17:36Z","title":"NavRAG: Generating User Demand Instructions for Embodied Navigation\n through Retrieval-Augmented LLM","summary":" Vision-and-Language Navigation (VLN) is an essential skill for embodied\nagents, allowing them to navigate in 3D environments following natural language\ninstructions. High-performance navigation models require a large amount of\ntraining data, the high cost of manually annotating data has seriously hindered\nthis field. Therefore, some previous methods translate trajectory videos into\nstep-by-step instructions for expanding data, but such instructions do not\nmatch well with users' communication styles that briefly describe destinations\nor state specific needs. Moreover, local navigation trajectories overlook\nglobal context and high-level task planning. To address these issues, we\npropose NavRAG, a retrieval-augmented generation (RAG) framework that generates\nuser demand instructions for VLN. NavRAG leverages LLM to build a hierarchical\nscene description tree for 3D scene understanding from global layout to local\ndetails, then simulates various user roles with specific demands to retrieve\nfrom the scene tree, generating diverse instructions with LLM. 
We annotate over\n2 million navigation instructions across 861 scenes and evaluate the data\nquality and navigation performance of trained models.\n","authors":["Zihan Wang","Yaohui Zhu","Gim Hee Lee","Yachun Fan"],"pdf_url":"https://arxiv.org/pdf/2502.11142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04591v2","updated":"2025-03-03T12:35:33Z","published":"2024-08-08T17:04:06Z","title":"HiLo: A Learning Framework for Generalized Category Discovery Robust to\n Domain Shifts","summary":" Generalized Category Discovery (GCD) is a challenging task in which, given a\npartially labelled dataset, models must categorize all unlabelled instances,\nregardless of whether they come from labelled categories or from new ones. In\nthis paper, we challenge a remaining assumption in this task: that all images\nshare the same domain. Specifically, we introduce a new task and method to\nhandle GCD when the unlabelled data also contains images from different domains\nto the labelled set. Our proposed `HiLo' networks extract High-level semantic\nand Low-level domain features, before minimizing the mutual information between\nthe representations. Our intuition is that the clusterings based on domain\ninformation and semantic information should be independent. We further extend\nour method with a specialized domain augmentation tailored for the GCD task, as\nwell as a curriculum learning approach. Finally, we construct a benchmark from\ncorrupted fine-grained datasets as well as a large-scale evaluation on\nDomainNet with real-world domain shifts, reimplementing a number of GCD\nbaselines in this setting. 
We demonstrate that HiLo outperforms SoTA category\ndiscovery models by a large margin on all evaluations.\n","authors":["Hongjun Wang","Sagar Vaze","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2408.04591v2.pdf","comment":"v2: Accepted as a conference paper at ICLR 2025; Project page:\n https://github.com/Visual-AI/hilo/"},{"id":"http://arxiv.org/abs/2502.17941v2","updated":"2025-03-03T12:00:57Z","published":"2025-02-25T08:03:04Z","title":"Optimal Brain Apoptosis","summary":" The increasing complexity and parameter count of Convolutional Neural\nNetworks (CNNs) and Transformers pose challenges in terms of computational\nefficiency and resource demands. Pruning has been identified as an effective\nstrategy to address these challenges by removing redundant elements such as\nneurons, channels, or connections, thereby enhancing computational efficiency\nwithout heavily compromising performance. This paper builds on the foundational\nwork of Optimal Brain Damage (OBD) by advancing the methodology of parameter\nimportance estimation using the Hessian matrix. Unlike previous approaches that\nrely on approximations, we introduce Optimal Brain Apoptosis (OBA), a novel\npruning method that calculates the Hessian-vector product value directly for\neach parameter. By decomposing the Hessian matrix across network layers and\nidentifying conditions under which inter-layer Hessian submatrices are\nnon-zero, we propose a highly efficient technique for computing the\nsecond-order Taylor expansion of parameters. This approach allows for a more\nprecise pruning process, particularly in the context of CNNs and Transformers,\nas validated in our experiments including VGG19, ResNet32, ResNet50, and\nViT-B/16 on CIFAR10, CIFAR100 and Imagenet datasets. 
Our code is available at\nhttps://github.com/NEU-REAL/OBA.\n","authors":["Mingyuan Sun","Zheng Fang","Jiaxu Wang","Junjie Jiang","Delei Kong","Chenming Hu","Yuetong Fang","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2502.17941v2.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2405.16195v3","updated":"2025-03-03T11:39:53Z","published":"2024-05-25T11:57:43Z","title":"Adaptive $Q$-Network: On-the-fly Target Selection for Deep Reinforcement\n Learning","summary":" Deep Reinforcement Learning (RL) is well known for being highly sensitive to\nhyperparameters, requiring practitioners substantial efforts to optimize them\nfor the problem at hand. This also limits the applicability of RL in real-world\nscenarios. In recent years, the field of automated Reinforcement Learning\n(AutoRL) has grown in popularity by trying to address this issue. However,\nthese approaches typically hinge on additional samples to select\nwell-performing hyperparameters, hindering sample-efficiency and practicality.\nFurthermore, most AutoRL methods are heavily based on already existing AutoML\nmethods, which were originally developed neglecting the additional challenges\ninherent to RL due to its non-stationarities. In this work, we propose a new\napproach for AutoRL, called Adaptive $Q$-Network (AdaQN), that is tailored to\nRL to take into account the non-stationarity of the optimization procedure\nwithout requiring additional samples. AdaQN learns several $Q$-functions, each\none trained with different hyperparameters, which are updated online using the\n$Q$-function with the smallest approximation error as a shared target. Our\nselection scheme simultaneously handles different hyperparameters while coping\nwith the non-stationarity induced by the RL optimization procedure and being\northogonal to any critic-based RL algorithm. 
We demonstrate that AdaQN is\ntheoretically sound and empirically validate it in MuJoCo control problems and\nAtari $2600$ games, showing benefits in sample-efficiency, overall performance,\nrobustness to stochasticity and training stability.\n","authors":["Théo Vincent","Fabian Wahren","Jan Peters","Boris Belousov","Carlo D'Eramo"],"pdf_url":"https://arxiv.org/pdf/2405.16195v3.pdf","comment":"Accepted at ICLR https://iclr.cc/virtual/2025/poster/28508"},{"id":"http://arxiv.org/abs/2410.11502v2","updated":"2025-03-03T11:38:11Z","published":"2024-10-15T11:15:03Z","title":"Offline Model-Based Optimization by Learning to Rank","summary":" Offline model-based optimization (MBO) aims to identify a design that\nmaximizes a black-box function using only a fixed, pre-collected dataset of\ndesigns and their corresponding scores. A common approach in offline MBO is to\ntrain a regression-based surrogate model by minimizing mean squared error (MSE)\nand then find the best design within this surrogate model by different\noptimizers (e.g., gradient ascent). However, a critical challenge is the risk\nof out-of-distribution errors, i.e., the surrogate model may typically\noverestimate the scores and mislead the optimizers into suboptimal regions.\nPrior works have attempted to address this issue in various ways, such as using\nregularization techniques and ensemble learning to enhance the robustness of\nthe model, but it still remains. In this paper, we argue that regression models\ntrained with MSE are not well-aligned with the primary goal of offline MBO,\nwhich is to select promising designs rather than to predict their scores\nprecisely. Notably, if a surrogate model can maintain the order of candidate\ndesigns based on their relative score relationships, it can produce the best\ndesigns even without precise predictions. 
To validate it, we conduct\nexperiments to compare the relationship between the quality of the final\ndesigns and MSE, finding that the correlation is really very weak. In contrast,\na metric that measures order-maintaining quality shows a significantly stronger\ncorrelation. Based on this observation, we propose learning a ranking-based\nmodel that leverages learning to rank techniques to prioritize promising\ndesigns based on their relative scores. We show that the generalization error\non ranking loss can be well bounded. Empirical results across diverse tasks\ndemonstrate the superior performance of our proposed ranking-based models than\ntwenty existing methods.\n","authors":["Rong-Xi Tan","Ke Xue","Shen-Huan Lyu","Haopu Shang","Yao Wang","Yaoyuan Wang","Sheng Fu","Chao Qian"],"pdf_url":"https://arxiv.org/pdf/2410.11502v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2407.06057v2","updated":"2025-03-03T11:08:15Z","published":"2024-07-08T15:59:44Z","title":"Variational Best-of-N Alignment","summary":" Best-of-N (BoN) is a popular and effective algorithm for aligning language\nmodels to human preferences. The algorithm works as follows: at inference time,\nN samples are drawn from the language model, and the sample with the highest\nreward, as judged by a reward model, is returned as the output. Despite its\neffectiveness, BoN is computationally expensive; it reduces sampling throughput\nby a factor of N. To make BoN more efficient at inference time, one strategy is\nto fine-tune the language model to mimic what BoN does during inference. To\nachieve this, we derive the distribution induced by the BoN algorithm. We then\npropose to fine-tune the language model to minimize backward KL divergence to\nthe BoN distribution. Our approach is analogous to mean-field variational\ninference and, thus, we term it variational BoN (vBoN). 
To the extent this\nfine-tuning is successful and we end up with a good approximation, we have\nreduced the inference cost by a factor of N. Our experiments on controlled\ngeneration and summarization tasks show that BoN is the most effective\nalignment method, and our variational approximation to BoN achieves the closest\nperformance to BoN and surpasses models fine-tuned using the standard\nKL-constrained RL objective. In the controlled generation task, vBoN appears\nmore frequently on the Pareto frontier of reward and KL divergence compared to\nother alignment methods. In the summarization task, vBoN achieves high reward\nvalues across various sampling temperatures.\n","authors":["Afra Amini","Tim Vieira","Elliott Ash","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2407.06057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.15285v2","updated":"2025-03-03T11:05:56Z","published":"2025-02-21T08:23:32Z","title":"Offload Rethinking by Cloud Assistance for Efficient Environmental Sound\n Recognition on LPWANs","summary":" Learning-based environmental sound recognition has emerged as a crucial\nmethod for ultra-low-power environmental monitoring in biological research and\ncity-scale sensing systems. These systems usually operate under limited\nresources and are often powered by harvested energy in remote areas. Recent\nefforts in on-device sound recognition suffer from low accuracy due to resource\nconstraints, whereas cloud offloading strategies are hindered by high\ncommunication costs. In this work, we introduce ORCA, a novel\nresource-efficient cloud-assisted environmental sound recognition system on\nbatteryless devices operating over the Low-Power Wide-Area Networks (LPWANs),\ntargeting wide-area audio sensing applications. We propose a cloud assistance\nstrategy that remedies the low accuracy of on-device inference while minimizing\nthe communication costs for cloud offloading. 
By leveraging a\nself-attention-based cloud sub-spectral feature selection method to facilitate\nefficient on-device inference, ORCA resolves three key challenges for\nresource-constrained cloud offloading over LPWANs: 1) high communication costs\nand low data rates, 2) dynamic wireless channel conditions, and 3) unreliable\noffloading. We implement ORCA on an energy-harvesting batteryless\nmicrocontroller and evaluate it in a real world urban sound testbed. Our\nresults show that ORCA outperforms state-of-the-art methods by up to $80\n\\times$ in energy savings and $220 \\times$ in latency reduction while\nmaintaining comparable accuracy.\n","authors":["Le Zhang","Quanling Zhao","Run Wang","Shirley Bian","Onat Gungor","Flavio Ponzina","Tajana Rosing"],"pdf_url":"https://arxiv.org/pdf/2502.15285v2.pdf","comment":"Accepted by The 23rd ACM Conference on Embedded Networked Sensor\n Systems (SenSys '25)"},{"id":"http://arxiv.org/abs/2411.12460v2","updated":"2025-03-03T10:35:27Z","published":"2024-11-19T12:36:02Z","title":"Exploring Iterative Controllable Summarization with Large Language\n Models","summary":" Large language models (LLMs) have demonstrated remarkable performance in\nabstractive summarization tasks. However, their ability to precisely control\nsummary attributes (e.g., length or topic) remains underexplored, limiting\ntheir adaptability to specific user preferences. In this paper, we\nsystematically explore the controllability of LLMs. To this end, we revisit\nsummary attribute measurements and introduce iterative evaluation metrics,\nfailure rate and average iteration count to precisely evaluate controllability\nof LLMs, rather than merely assessing errors. Our findings show that LLMs\nstruggle more with numerical attributes than with linguistic attributes. To\naddress this challenge, we propose a guide-to-explain framework (GTE) for\ncontrollable summarization. 
Our GTE framework enables the model to identify\nmisaligned attributes in the initial draft and guides it in self-explaining\nerrors in the previous output. By allowing the model to reflect on its\nmisalignment, GTE generates well-adjusted summaries that satisfy the desired\nattributes with robust effectiveness, requiring surprisingly fewer iterations\nthan other iterative approaches.\n","authors":["Sangwon Ryu","Heejin Do","Daehee Kim","Hwanjo Yu","Dongwoo Kim","Yunsu Kim","Gary Geunbae Lee","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2411.12460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21264v2","updated":"2025-03-03T10:35:23Z","published":"2025-02-28T17:40:45Z","title":"Foundation Models -- A Panacea for Artificial Intelligence in Pathology?","summary":" The role of artificial intelligence (AI) in pathology has evolved from aiding\ndiagnostics to uncovering predictive morphological patterns in whole slide\nimages (WSIs). Recently, foundation models (FMs) leveraging self-supervised\npre-training have been widely advocated as a universal solution for diverse\ndownstream tasks. However, open questions remain about their clinical\napplicability and generalization advantages over end-to-end learning using\ntask-specific (TS) models. Here, we focused on AI with clinical-grade\nperformance for prostate cancer diagnosis and Gleason grading. We present the\nlargest validation of AI for this task, using over 100,000 core needle biopsies\nfrom 7,342 patients across 15 sites in 11 countries. We compared two FMs with a\nfully end-to-end TS model in a multiple instance learning framework. Our\nfindings challenge assumptions that FMs universally outperform TS models. While\nFMs demonstrated utility in data-scarce scenarios, their performance converged\nwith - and was in some cases surpassed by - TS models when sufficient labeled\ntraining data were available. 
Notably, extensive task-specific training\nmarkedly reduced clinically significant misgrading, misdiagnosis of challenging\nmorphologies, and variability across different WSI scanners. Additionally, FMs\nused up to 35 times more energy than the TS model, raising concerns about their\nsustainability. Our results underscore that while FMs offer clear advantages\nfor rapid prototyping and research, their role as a universal solution for\nclinically applicable medical AI remains uncertain. For high-stakes clinical\napplications, rigorous validation and consideration of task-specific training\nremain critically important. We advocate for integrating the strengths of FMs\nand end-to-end learning to achieve robust and resource-efficient AI pathology\nsolutions fit for clinical use.\n","authors":["Nita Mulliqi","Anders Blilie","Xiaoyi Ji","Kelvin Szolnoky","Henrik Olsson","Sol Erika Boman","Matteo Titus","Geraldine Martinez Gonzalez","Julia Anna Mielcarz","Masi Valkonen","Einar Gudlaugsson","Svein R. Kjosavik","José Asenjo","Marcello Gambacorta","Paolo Libretti","Marcin Braun","Radzislaw Kordek","Roman Łowicki","Kristina Hotakainen","Päivi Väre","Bodil Ginnerup Pedersen","Karina Dalsgaard Sørensen","Benedicte Parm Ulhøi","Pekka Ruusuvuori","Brett Delahunt","Hemamali Samaratunga","Toyonori Tsuzuki","Emilius A. M. 
Janssen","Lars Egevad","Martin Eklund","Kimmo Kartasalo"],"pdf_url":"https://arxiv.org/pdf/2502.21264v2.pdf","comment":"50 pages, 15 figures and an appendix (study protocol) which is\n previously published, see https://doi.org/10.1101/2024.07.04.24309948;\n updated authors list format"},{"id":"http://arxiv.org/abs/2502.15425v3","updated":"2025-03-03T10:35:14Z","published":"2025-02-21T12:52:16Z","title":"TAG: A Decentralized Framework for Multi-Agent Hierarchical\n Reinforcement Learning","summary":" Hierarchical organization is fundamental to biological systems and human\nsocieties, yet artificial intelligence systems often rely on monolithic\narchitectures that limit adaptability and scalability. Current hierarchical\nreinforcement learning (HRL) approaches typically restrict hierarchies to two\nlevels or require centralized training, which limits their practical\napplicability. We introduce TAME Agent Framework (TAG), a framework for\nconstructing fully decentralized hierarchical multi-agent systems.TAG enables\nhierarchies of arbitrary depth through a novel LevelEnv concept, which\nabstracts each hierarchy level as the environment for the agents above it. This\napproach standardizes information flow between levels while preserving loose\ncoupling, allowing for seamless integration of diverse agent types. We\ndemonstrate the effectiveness of TAG by implementing hierarchical architectures\nthat combine different RL agents across multiple levels, achieving improved\nperformance over classical multi-agent RL baselines on standard benchmarks. 
Our\nresults show that decentralized hierarchical organization enhances both\nlearning speed and final performance, positioning TAG as a promising direction\nfor scalable multi-agent systems.\n","authors":["Giuseppe Paolo","Abdelhakim Benechehab","Hamza Cherkaoui","Albert Thomas","Balázs Kégl"],"pdf_url":"https://arxiv.org/pdf/2502.15425v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21201v2","updated":"2025-03-03T10:32:20Z","published":"2025-02-28T16:18:57Z","title":"The PanAf-FGBG Dataset: Understanding the Impact of Backgrounds in\n Wildlife Behaviour Recognition","summary":" Computer vision analysis of camera trap video footage is essential for\nwildlife conservation, as captured behaviours offer some of the earliest\nindicators of changes in population health. Recently, several high-impact\nanimal behaviour datasets and methods have been introduced to encourage their\nuse; however, the role of behaviour-correlated background information and its\nsignificant effect on out-of-distribution generalisation remain unexplored. In\nresponse, we present the PanAf-FGBG dataset, featuring 20 hours of wild\nchimpanzee behaviours, recorded at over 350 individual camera locations.\nUniquely, it pairs every video with a chimpanzee (referred to as a foreground\nvideo) with a corresponding background video (with no chimpanzee) from the same\ncamera location. We present two views of the dataset: one with overlapping\ncamera locations and one with disjoint locations. This setup enables, for the\nfirst time, direct evaluation of in-distribution and out-of-distribution\nconditions, and for the impact of backgrounds on behaviour recognition models\nto be quantified. 
All clips come with rich behavioural annotations and metadata\nincluding unique camera IDs and detailed textual scene descriptions.\nAdditionally, we establish several baselines and present a highly effective\nlatent-space normalisation technique that boosts out-of-distribution\nperformance by +5.42% mAP for convolutional and +3.75% mAP for\ntransformer-based models. Finally, we provide an in-depth analysis on the role\nof backgrounds in out-of-distribution behaviour recognition, including the so\nfar unexplored impact of background durations (i.e., the count of background\nframes within foreground videos).\n","authors":["Otto Brookes","Maksim Kukushkin","Majid Mirmehdi","Colleen Stephens","Paula Dieguez","Thurston C. Hicks","Sorrel Jones","Kevin Lee","Maureen S. McCarthy","Amelia Meier","Emmanuelle Normand","Erin G. Wessling","Roman M. Wittig","Kevin Langergraber","Klaus Zuberbühler","Lukas Boesch","Thomas Schmid","Mimi Arandjelovic","Hjalmar Kühl","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2502.21201v2.pdf","comment":"Accepted at the IEEE / CVF Computer Vision and Pattern Recognition\n Conference 2025"},{"id":"http://arxiv.org/abs/2411.06916v2","updated":"2025-03-03T10:22:24Z","published":"2024-11-11T12:19:28Z","title":"Slowing Down Forgetting in Continual Learning","summary":" A common challenge in continual learning (CL) is catastrophic forgetting,\nwhere the performance on old tasks drops after new, additional tasks are\nlearned. In this paper, we propose a novel framework called ReCL to slow down\nforgetting in CL. Our framework exploits an implicit bias of gradient-based\nneural networks due to which these converge to margin maximization points. Such\nconvergence points allow us to reconstruct old data from previous tasks, which\nwe then combine with the current training data. Our framework is flexible and\ncan be applied on top of existing, state-of-the-art CL methods. 
We further\ndemonstrate the performance gain from our framework across a large series of\nexperiments, including two challenging CL scenarios (class incremental and\ndomain incremental learning), different datasets (MNIST, CIFAR10,\nTinyImagenet), and different network architectures. Across all experiments, we\nfind large performance gains through ReCL. To the best of our knowledge, our\nframework is the first to address catastrophic forgetting by leveraging models\nin CL as their own memory buffers.\n","authors":["Pascal Janetzky","Tobias Schlagenhauf","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2411.06916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21123v2","updated":"2025-03-03T10:00:03Z","published":"2025-02-28T14:57:33Z","title":"Causality Is Key to Understand and Balance Multiple Goals in Trustworthy\n ML and Foundation Models","summary":" Ensuring trustworthiness in machine learning (ML) systems is crucial as they\nbecome increasingly embedded in high-stakes domains. This paper advocates for\nintegrating causal methods into machine learning to navigate the trade-offs\namong key principles of trustworthy ML, including fairness, privacy,\nrobustness, accuracy, and explainability. While these objectives should ideally\nbe satisfied simultaneously, they are often addressed in isolation, leading to\nconflicts and suboptimal solutions. Drawing on existing applications of\ncausality in ML that successfully align goals such as fairness and accuracy or\nprivacy and robustness, this paper argues that a causal approach is essential\nfor balancing multiple competing objectives in both trustworthy ML and\nfoundation models. Beyond highlighting these trade-offs, we examine how\ncausality can be practically integrated into ML and foundation models, offering\nsolutions to enhance their reliability and interpretability. 
Finally, we\ndiscuss the challenges, limitations, and opportunities in adopting causal\nframeworks, paving the way for more accountable and ethically sound AI systems.\n","authors":["Ruta Binkyte","Ivaxi Sheth","Zhijing Jin","Mohammad Havaei","Bernhard Schölkopf","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2502.21123v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.11355v2","updated":"2025-03-03T09:45:24Z","published":"2025-02-17T02:11:17Z","title":"\"Nuclear Deployed!\": Analyzing Catastrophic Risks in Decision-making of\n Autonomous LLM Agents","summary":" Large language models (LLMs) are evolving into autonomous decision-makers,\nraising concerns about catastrophic risks in high-stakes scenarios,\nparticularly in Chemical, Biological, Radiological and Nuclear (CBRN) domains.\nBased on the insight that such risks can originate from trade-offs between the\nagent's Helpful, Harmlessness and Honest (HHH) goals, we build a novel\nthree-stage evaluation framework, which is carefully constructed to effectively\nand naturally expose such risks. We conduct 14,400 agentic simulations across\n12 advanced LLMs, with extensive experiments and analysis. Results reveal that\nLLM agents can autonomously engage in catastrophic behaviors and deception,\nwithout being deliberately induced. Furthermore, stronger reasoning abilities\noften increase, rather than mitigate, these risks. We also show that these\nagents can violate instructions and superior commands. 
On the whole, we\nempirically prove the existence of catastrophic risks in autonomous LLM agents.\nWe will release our code upon request.\n","authors":["Rongwu Xu","Xiaojian Li","Shuo Chen","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2502.11355v2.pdf","comment":"Please visit https://llm-catastrophic-risks.github.io for a quick\n tour of our project"},{"id":"http://arxiv.org/abs/2407.14651v3","updated":"2025-03-03T09:31:01Z","published":"2024-07-19T20:05:10Z","title":"Improving Representation of High-frequency Components for Medical Visual\n Foundation Models","summary":" Foundation models have recently attracted significant attention for their\nimpressive generalizability across diverse downstream tasks. However, these\nmodels are demonstrated to exhibit great limitations in representing\nhigh-frequency components and fine-grained details. In many medical imaging\ntasks, the precise representation of such information is crucial due to the\ninherently intricate anatomical structures, sub-visual features, and complex\nboundaries involved. Consequently, the limited representation of prevalent\nfoundation models can result in significant performance degradation or even\nfailure in these tasks. To address these challenges, we propose a novel\npretraining strategy, named Frequency-advanced Representation Autoencoder\n(Frepa). Through high-frequency masking and low-frequency perturbation combined\nwith adversarial learning, Frepa encourages the encoder to effectively\nrepresent and preserve high-frequency components in the image embeddings.\nAdditionally, we introduce an innovative histogram-equalized image masking\nstrategy, extending the Masked Autoencoder approach beyond ViT to other\narchitectures such as Swin Transformer and convolutional networks. We develop\nFrepa across nine medical modalities and validate it on 32 downstream tasks for\nboth 2D images and 3D volume data. 
Without fine-tuning, Frepa can outperform\nother self-supervised pretraining methods and, in some cases, even surpasses\ntask-specific trained models. This improvement is particularly significant for\ntasks involving fine-grained details, such as achieving up to a +15% increase\nin DSC for retina vessel segmentation and a +7% increase in IoU for lung nodule\ndetection. Further experiments quantitatively reveal that Frepa enables\nsuperior high-frequency representations and preservation in the embeddings,\nunderscoring its potential for developing more generalized and universal\nmedical image foundation models.\n","authors":["Yuetan Chu","Yilan Zhang","Zhongyi Han","Changchun Yang","Longxi Zhou","Gongning Luo","Chao Huang","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2407.14651v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09911v2","updated":"2025-03-03T09:21:11Z","published":"2024-02-15T12:20:02Z","title":"Enhancing Large Language Models with Pseudo- and Multisource- Knowledge\n Graphs for Open-ended Question Answering","summary":" Mitigating the hallucinations of Large Language Models is a crucial task.\nAlthough some existing methods employ self-enhancement techniques, they fall\nshort of effectively addressing unknown factual hallucinations. Meanwhile,\nKnowledge Graph (KG) enhancement approaches fail to address the generalization\nacross different KG sources and the enhancement of open-ended answer questions\nsimultaneously. To tackle these limitations, we propose a framework that\ncombines Pseudo-Graph Generation and Atomic Knowledge Verification (PG\\&AKV).\nEnhancement of open-ended question-answering begins with leveraging the\nPseudo-Graph Generation to provide the related knowledge framework.\nSubsequently, Atomic Knowledge Verification utilizes atomic-level knowledge\nquerying and verification to achieve generalizability under different KG\nsources. 
Compared to the baseline, this approach yields a minimum improvement\nof 11.5 in the ROUGE-L score for open-ended questions. For precise-answered\nquestions, we observe a minimum accuracy improvement of 7.5%. Moreover, PG\\&AKV\nalso exhibits generalizability across different KG sources. Utilizing KG\ndifferent from the question sources, PG\\&AKV can even achieve at least a 3.5 %\nperformance improvement. In summary, our results pave the way for enhancing\nLLMs by incorporating Pseudo- and Multisource-KGs, particularly in the filed of\nopen-ended questions.\n","authors":["Jiaxiang Liu","Tong Zhou","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.09911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21228v2","updated":"2025-03-03T09:11:46Z","published":"2025-02-28T16:59:30Z","title":"ECLeKTic: a Novel Challenge Set for Evaluation of Cross-Lingual\n Knowledge Transfer","summary":" To achieve equitable performance across languages, multilingual large\nlanguage models (LLMs) must be able to abstract knowledge beyond the language\nin which it was acquired. However, the current literature lacks reliable ways\nto measure LLMs' capability of cross-lingual knowledge transfer. To that end,\nwe present ECLeKTic, a multilingual closed-book QA (CBQA) dataset that\nEvaluates Cross-Lingual Knowledge Transfer in a simple, black-box manner. We\ndetected information with uneven coverage across languages by controlling for\npresence and absence of Wikipedia articles in 12 languages. We generated\nknowledge-seeking questions in a source language, for which the answer appears\nin a relevant Wikipedia article and translated them to all other 11 languages,\nfor which the respective Wikipedias lack equivalent articles. Assuming that\nWikipedia reflects the prominent knowledge in the LLM's training data, to solve\nECLeKTic's CBQA task the model is required to transfer knowledge between\nlanguages. 
Experimenting with 8 LLMs, we show that SOTA models struggle to\neffectively share knowledge across, languages even if they can predict the\nanswer well for queries in the same language the knowledge was acquired in.\n","authors":["Omer Goldman","Uri Shaham","Dan Malkin","Sivan Eiger","Avinatan Hassidim","Yossi Matias","Joshua Maynez","Adi Mayrav Gilady","Jason Riesa","Shruti Rijhwani","Laura Rimell","Idan Szpektor","Reut Tsarfaty","Matan Eyal"],"pdf_url":"https://arxiv.org/pdf/2502.21228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16751v3","updated":"2025-03-03T09:07:59Z","published":"2025-01-28T07:08:20Z","title":"HiBug2: Efficient and Interpretable Error Slice Discovery for\n Comprehensive Model Debugging","summary":" Despite the significant success of deep learning models in computer vision,\nthey often exhibit systematic failures on specific data subsets, known as error\nslices. Identifying and mitigating these error slices is crucial to enhancing\nmodel robustness and reliability in real-world scenarios. In this paper, we\nintroduce HiBug2, an automated framework for error slice discovery and model\nrepair. HiBug2 first generates task-specific visual attributes to highlight\ninstances prone to errors through an interpretable and structured process. It\nthen employs an efficient slice enumeration algorithm to systematically\nidentify error slices, overcoming the combinatorial challenges that arise\nduring slice exploration. Additionally, HiBug2 extends its capabilities by\npredicting error slices beyond the validation set, addressing a key limitation\nof prior approaches. 
Extensive experiments across multiple domains, including\nimage classification, pose estimation, and object detection - show that HiBug2\nnot only improves the coherence and precision of identified error slices but\nalso significantly enhances the model repair capabilities.\n","authors":["Muxi Chen","Chenchen Zhao","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2501.16751v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.16890v2","updated":"2025-03-03T08:58:48Z","published":"2025-02-24T06:40:33Z","title":"ReFocus: Reinforcing Mid-Frequency and Key-Frequency Modeling for\n Multivariate Time Series Forecasting","summary":" Recent advancements have progressively incorporated frequency-based\ntechniques into deep learning models, leading to notable improvements in\naccuracy and efficiency for time series analysis tasks. However, the\nMid-Frequency Spectrum Gap in the real-world time series, where the energy is\nconcentrated at the low-frequency region while the middle-frequency band is\nnegligible, hinders the ability of existing deep learning models to extract the\ncrucial frequency information. Additionally, the shared Key-Frequency in\nmultivariate time series, where different time series share indistinguishable\nfrequency patterns, is rarely exploited by existing literature. This work\nintroduces a novel module, Adaptive Mid-Frequency Energy Optimizer, based on\nconvolution and residual learning, to emphasize the significance of\nmid-frequency bands. We also propose an Energy-based Key-Frequency Picking\nBlock to capture shared Key-Frequency, which achieves superior inter-series\nmodeling performance with fewer parameters. A novel Key-Frequency Enhanced\nTraining strategy is employed to further enhance Key-Frequency modeling, where\nspectral information from other channels is randomly introduced into each\nchannel. 
Our approach advanced multivariate time series forecasting on the\nchallenging Traffic, ECL, and Solar benchmarks, reducing MSE by 4%, 6%, and 5%\ncompared to the previous SOTA iTransformer. Code is available at this GitHub\nRepository: https://github.com/Levi-Ackman/ReFocus.\n","authors":["Guoqi Yu","Yaoming Li","Juncheng Wang","Xiaoyu Guo","Angelica I. Aviles-Rivero","Tong Yang","Shujun Wang"],"pdf_url":"https://arxiv.org/pdf/2502.16890v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2502.08679v3","updated":"2025-03-03T08:50:28Z","published":"2025-02-12T08:56:35Z","title":"Deep Learning-Driven Malware Classification with API Call Sequence\n Analysis and Concept Drift Handling","summary":" Malware classification in dynamic environments presents a significant\nchallenge due to concept drift, where the statistical properties of malware\ndata evolve over time, complicating detection efforts. To address this issue,\nwe propose a deep learning framework enhanced with a genetic algorithm to\nimprove malware classification accuracy and adaptability. Our approach\nincorporates mutation operations and fitness score evaluations within genetic\nalgorithms to continuously refine the deep learning model, ensuring robustness\nagainst evolving malware threats. Experimental results demonstrate that this\nhybrid method significantly enhances classification performance and\nadaptability, outperforming traditional static models. 
Our proposed approach\noffers a promising solution for real-time malware classification in\never-changing cybersecurity landscapes.\n","authors":["Bishwajit Prasad Gond","Durga Prasad Mohapatra"],"pdf_url":"https://arxiv.org/pdf/2502.08679v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00724v3","updated":"2025-03-03T07:53:32Z","published":"2024-08-01T17:16:04Z","title":"Inference Scaling Laws: An Empirical Analysis of Compute-Optimal\n Inference for Problem-Solving with Language Models","summary":" While the scaling laws of large language models (LLMs) training have been\nextensively studied, optimal inference configurations of LLMs remain\nunderexplored. We study inference scaling laws (aka test-time scaling laws) and\ncompute-optimal inference, focusing on the trade-offs between model sizes and\ngenerating additional tokens with different inference strategies. As a first\nstep towards understanding and designing compute-optimal inference methods, we\nstudied cost-performance trade-offs for inference strategies such as greedy\nsearch, majority voting, best-of-$n$, weighted voting, and two different tree\nsearch algorithms, using different model sizes and compute budgets. Our\nfindings suggest that scaling inference compute with inference strategies can\nbe more computationally efficient than scaling model parameters. Additionally,\nsmaller models combined with advanced inference algorithms offer Pareto-optimal\ntrade-offs in cost and performance. 
For example, the Llemma-7B model, when\npaired with our novel tree search algorithm, consistently outperforms the\nLlemma-34B model across all tested inference strategies on the MATH benchmark.\nWe hope these insights contribute to a deeper understanding of inference\nscaling laws (test-time scaling laws) for LLMs.\n","authors":["Yangzhen Wu","Zhiqing Sun","Shanda Li","Sean Welleck","Yiming Yang"],"pdf_url":"https://arxiv.org/pdf/2408.00724v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20429v2","updated":"2025-03-03T07:46:41Z","published":"2025-02-27T14:04:02Z","title":"Will AI replace Software Engineers? Do not hold your breath","summary":" Artificial Intelligence (AI) technology such as Large Language Models (LLMs)\nhave become extremely popular in creating code. This has led to the conjecture\nthat future software jobs will be exclusively conducted by LLMs, and the\nsoftware industry will cease to exist. But software engineering is much more\nthan producing code -- notably, \\emph{maintaining} large software and keeping\nit reliable is a major part of software engineering, which LLMs are not yet\ncapable of.\n","authors":["Abhik Roychoudhury","Andreas Zeller"],"pdf_url":"https://arxiv.org/pdf/2502.20429v2.pdf","comment":"3 pages"},{"id":"http://arxiv.org/abs/2409.14866v5","updated":"2025-03-03T07:25:21Z","published":"2024-09-23T10:03:09Z","title":"PAPILLON: Efficient and Stealthy Fuzz Testing-Powered Jailbreaks for\n LLMs","summary":" Large Language Models (LLMs) have excelled in various tasks but are still\nvulnerable to jailbreaking attacks, where attackers create jailbreak prompts to\nmislead the model to produce harmful or offensive content. Current jailbreak\nmethods either rely heavily on manually crafted templates, which pose\nchallenges in scalability and adaptability, or struggle to generate\nsemantically coherent prompts, making them easy to detect. 
Additionally, most\nexisting approaches involve lengthy prompts, leading to higher query costs. In\nthis paper, to remedy these challenges, we introduce a novel jailbreaking\nattack framework called PAPILLON, which is an automated, black-box jailbreaking\nattack framework that adapts the black-box fuzz testing approach with a series\nof customized designs. Instead of relying on manually crafted\ntemplates,PAPILLON starts with an empty seed pool, removing the need to search\nfor any related jailbreaking templates. We also develop three novel\nquestion-dependent mutation strategies using an LLM helper to generate prompts\nthat maintain semantic coherence while significantly reducing their length.\nAdditionally, we implement a two-level judge module to accurately detect\ngenuine successful jailbreaks. We evaluated PAPILLON on 7 representative LLMs\nand compared it with 5 state-of-the-art jailbreaking attack strategies. For\nproprietary LLM APIs, such as GPT-3.5 turbo, GPT-4, and Gemini-Pro, PAPILLONs\nachieves attack success rates of over 90%, 80%, and 74%, respectively,\nexceeding existing baselines by more than 60\\%. Additionally, PAPILLON can\nmaintain high semantic coherence while significantly reducing the length of\njailbreak prompts. When targeting GPT-4, PAPILLON can achieve over 78% attack\nsuccess rate even with 100 tokens. Moreover, PAPILLON demonstrates\ntransferability and is robust to state-of-the-art defenses. 
Code:\nhttps://github.com/aaFrostnova/Papillon\n","authors":["Xueluan Gong","Mingzhe Li","Yilin Zhang","Fengyuan Ran","Chen Chen","Yanjiao Chen","Qian Wang","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2409.14866v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02683v2","updated":"2025-03-03T07:20:54Z","published":"2024-10-03T17:08:52Z","title":"DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of\n Daily Life","summary":" As users increasingly seek guidance from LLMs for decision-making in daily\nlife, many of these decisions are not clear-cut and depend significantly on the\npersonal values and ethical standards of people. We present DailyDilemmas, a\ndataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma\npresents two possible actions, along with affected parties and relevant human\nvalues for each action. Based on these dilemmas, we gather a repository of\nhuman values covering diverse everyday topics, such as interpersonal\nrelationships, workplace, and environmental issues. With DailyDilemmas, we\nevaluate LLMs on these dilemmas to determine what action they will choose and\nthe values represented by these action choices. Then, we analyze values through\nthe lens of five theoretical frameworks inspired by sociology, psychology, and\nphilosophy, including the World Values Survey, Moral Foundations Theory,\nMaslow's Hierarchy of Needs, Aristotle's Virtues, and Plutchik's Wheel of\nEmotions. For instance, we find LLMs are most aligned with self-expression over\nsurvival in World Values Survey and care over loyalty in Moral Foundations\nTheory. Interestingly, we find substantial preference differences in models for\nsome core values. For example, for truthfulness, Mixtral-8x7B neglects it by\n9.7% while GPT-4-turbo selects it by 9.4%. 
We also study the recent guidance\nreleased by OpenAI (ModelSpec), and Anthropic (Constitutional AI) to understand\nhow their designated principles reflect their models' actual value\nprioritization when facing nuanced moral reasoning in daily-life settings.\nFinally, we find that end users cannot effectively steer such prioritization\nusing system prompts.\n","authors":["Yu Ying Chiu","Liwei Jiang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02683v2.pdf","comment":"Accepted into ICLR 2025 (spotlight)"},{"id":"http://arxiv.org/abs/2501.02497v2","updated":"2025-03-03T07:16:16Z","published":"2025-01-05T10:24:20Z","title":"Test-Time Compute: from System-1 Thinking to System-2 Thinking","summary":" The remarkable performance of the o1 model in complex reasoning demonstrates\nthat test-time compute scaling can further unlock the model's potential,\nenabling powerful System-2 thinking. However, there is still a lack of\ncomprehensive surveys for test-time compute scaling. We trace the concept of\ntest-time compute back to System-1 models. In System-1 models, test-time\ncompute addresses distribution shifts and improves robustness and\ngeneralization through parameter updating, input modification, representation\nediting, and output calibration. In System-2 models, it enhances the model's\nreasoning ability to solve complex problems through repeated sampling,\nself-correction, and tree search. We organize this survey according to the\ntrend of System-1 to System-2 thinking, highlighting the key role of test-time\ncompute in the transition from System-1 models to weak System-2 models, and\nthen to strong System-2 models. 
We also point out a few possible future\ndirections.\n","authors":["Yixin Ji","Juntao Li","Hai Ye","Kaixin Wu","Kai Yao","Jia Xu","Linjian Mo","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.02497v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2502.18874v2","updated":"2025-03-03T07:13:12Z","published":"2025-02-26T06:31:45Z","title":"Learning to Align Multi-Faceted Evaluation: A Unified and Robust\n Framework","summary":" Large Language Models (LLMs) are being used more and more extensively for\nautomated evaluation in various scenarios. Previous studies have attempted to\nfine-tune open-source LLMs to replicate the evaluation explanations and\njudgments of powerful proprietary models, such as GPT-4. However, these methods\nare largely limited to text-based analyses under predefined general criteria,\nresulting in reduced adaptability for unseen instructions and demonstrating\ninstability in evaluating adherence to quantitative and structural constraints.\nTo address these limitations, we propose a novel evaluation framework, ARJudge,\nthat adaptively formulates evaluation criteria and synthesizes both text-based\nand code-driven analyses to evaluate LLM responses. ARJudge consists of two\ncomponents: a fine-tuned Analyzer that generates multi-faceted evaluation\nanalyses and a tuning-free Refiner that combines and refines all analyses to\nmake the final judgment. We construct a Composite Analysis Corpus that\nintegrates tasks for evaluation criteria generation alongside text-based and\ncode-driven analysis generation to train the Analyzer. Our results demonstrate\nthat ARJudge outperforms existing fine-tuned evaluators in effectiveness and\nrobustness. 
Furthermore, it demonstrates the importance of multi-faceted\nevaluation and code-driven analyses in enhancing evaluation capabilities.\n","authors":["Kaishuai Xu","Tiezheng Yu","Wenjun Hou","Yi Cheng","Liangyou Li","Xin Jiang","Lifeng Shang","Qun Liu","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2502.18874v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06638v3","updated":"2025-03-03T07:09:42Z","published":"2024-10-09T07:43:38Z","title":"Subtle Errors Matter: Preference Learning via Error-injected\n Self-editing","summary":" Large Language Models (LLMs) have exhibited strong mathematical reasoning\nprowess, tackling tasks ranging from basic arithmetic to advanced\ncompetition-level problems. However, frequently occurring subtle yet critical\nerrors, such as miscalculations or incorrect substitutions, limit the LLMs'\nfull potential. Existing studies to improve mathematical ability typically\ninvolve applying preference learning to step-wise solution pairs. Although\nthese methods leverage samples of varying granularity to mitigate reasoning\nerrors, they overlook critical subtle errors. In this work, we propose a novel\npreference learning framework called eRror-Injected Self-Editing (RISE), which\ninjects predefined subtle errors into pivotal tokens in reasoning or\ncomputation steps to construct hard pairs for error mitigation. In detail, RISE\nuses the LLM itself to edit a small number of tokens in the solution, injecting\ndesigned subtle errors. Then, pairs composed of self-edited solutions and their\ncorresponding correct ones, along with pairs of correct and incorrect solutions\nobtained through sampling, are used together for subtle error-aware DPO\ntraining. Compared with other preference learning methods, RISE further refines\nthe training objective without requiring fine-grained sampling or preference\nannotation. 
Extensive experiments validate the effectiveness of RISE, with\npreference learning on Qwen2-7B-Instruct yielding notable improvements of 3.0%\non GSM8K and 7.9% on MATH with only 4.5K training samples. Moreover, the effect\nof error mitigation extends from mathematical reasoning to logical reasoning\nand code generation.\n","authors":["Kaishuai Xu","Tiezheng Yu","Wenjun Hou","Yi Cheng","Chak Tou Leong","Liangyou Li","Xin Jiang","Lifeng Shang","Qun Liu","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2410.06638v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03636v3","updated":"2025-03-03T06:56:29Z","published":"2024-03-06T11:48:08Z","title":"SheetAgent: Towards A Generalist Agent for Spreadsheet Reasoning and\n Manipulation via Large Language Models","summary":" Spreadsheets are ubiquitous across the World Wide Web, playing a critical\nrole in enhancing work efficiency across various domains. Large language model\n(LLM) has been recently attempted for automatic spreadsheet manipulation but\nhas not yet been investigated in complicated and realistic tasks where\nreasoning challenges exist (e.g., long horizon manipulation with multi-step\nreasoning and ambiguous requirements). To bridge the gap with the real-world\nrequirements, we introduce SheetRM, a benchmark featuring long-horizon and\nmulti-category tasks with reasoning-dependent manipulation caused by real-life\nchallenges. To mitigate the above challenges, we further propose SheetAgent, a\nnovel autonomous agent that utilizes the power of LLMs. SheetAgent consists of\nthree collaborative modules: Planner, Informer, and Retriever, achieving both\nadvanced reasoning and accurate manipulation over spreadsheets without human\ninteraction through iterative task reasoning and reflection. 
Extensive\nexperiments demonstrate that SheetAgent delivers 20--40\\% pass rate\nimprovements on multiple benchmarks over baselines, achieving enhanced\nprecision in spreadsheet manipulation and demonstrating superior table\nreasoning abilities. More details and visualizations are available at the\nproject website: https://sheetagent.github.io/. The datasets and source code\nare available at https://anonymous.4open.science/r/SheetAgent.\n","authors":["Yibin Chen","Yifu Yuan","Zeyu Zhang","Yan Zheng","Jinyi Liu","Fei Ni","Jianye Hao","Hangyu Mao","Fuzheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.03636v3.pdf","comment":"Accepted by International World Wide Web Conference (WWW) 2025 (oral)"},{"id":"http://arxiv.org/abs/2502.07190v2","updated":"2025-03-03T06:50:25Z","published":"2025-02-11T02:31:09Z","title":"Understanding LLMs' Fluid Intelligence Deficiency: An Analysis of the\n ARC Task","summary":" While LLMs have exhibited strong performance on various NLP tasks, it is\nnoteworthy that most of these tasks rely on utilizing the vast amount of\nknowledge encoded in LLMs' parameters, rather than solving new problems without\nprior knowledge. In cognitive research, the latter ability is referred to as\nfluid intelligence, which is considered to be critical for assessing human\nintelligence. Recent research on fluid intelligence assessments has highlighted\nsignificant deficiencies in LLMs' abilities. In this paper, we analyze the\nchallenges LLMs face in demonstrating fluid intelligence through controlled\nexperiments, using the most representative ARC task as an example. Our study\nrevealed three major limitations in existing LLMs: limited ability for skill\ncomposition, unfamiliarity with abstract input formats, and the intrinsic\ndeficiency of left-to-right decoding. 
Our data and code can be found in\nhttps://wujunjie1998.github.io/araoc-benchmark.github.io/.\n","authors":["Junjie Wu","Mo Yu","Lemao Liu","Dit-Yan Yeung","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2502.07190v2.pdf","comment":"22 pages, 9 figures, accepted by NAACL 2025 main conference"},{"id":"http://arxiv.org/abs/2501.12296v2","updated":"2025-03-03T06:45:12Z","published":"2025-01-21T17:03:06Z","title":"RALAD: Bridging the Real-to-Sim Domain Gap in Autonomous Driving with\n Retrieval-Augmented Learning","summary":" In the pursuit of robust autonomous driving systems, models trained on\nreal-world datasets often struggle to adapt to new environments, particularly\nwhen confronted with corner cases such as extreme weather conditions.\nCollecting these corner cases in the real world is non-trivial, which\nnecessitates the use of simulators for validation. However,the high\ncomputational cost and the domain gap in data distribution have hindered the\nseamless transition between real and simulated driving scenarios. To tackle\nthis challenge, we propose Retrieval-Augmented Learning for Autonomous Driving\n(RALAD), a novel framework designed to bridge the real-to-sim gap at a low\ncost. RALAD features three primary designs, including (1) domain adaptation via\nan enhanced Optimal Transport (OT) method that accounts for both individual and\ngrouped image distances, (2) a simple and unified framework that can be applied\nto various models, and (3) efficient fine-tuning techniques that freeze the\ncomputationally expensive layers while maintaining robustness. Experimental\nresults demonstrate that RALAD compensates for the performance degradation in\nsimulated environments while maintaining accuracy in real-world scenarios\nacross three different models. 
Taking Cross View as an example, the mIOU and\nmAP metrics in real-world scenarios remain stable before and after RALAD\nfine-tuning, while in simulated environments,the mIOU and mAP metrics are\nimproved by 10.30% and 12.29%, respectively. Moreover, the re-training cost of\nour approach is reduced by approximately 88.1%. Our code is available at\nhttps://github.com/JiachengZuo/RALAD.git.\n","authors":["Jiacheng Zuo","Haibo Hu","Zikang Zhou","Yufei Cui","Ziquan Liu","Jianping Wang","Nan Guan","Jin Wang","Chun Jason Xue"],"pdf_url":"https://arxiv.org/pdf/2501.12296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15598v2","updated":"2025-03-03T06:39:17Z","published":"2024-12-20T06:42:58Z","title":"Long-Term EEG Partitioning for Seizure Onset Detection","summary":" Deep learning models have recently shown great success in classifying\nepileptic patients using EEG recordings. Unfortunately, classification-based\nmethods lack a sound mechanism to detect the onset of seizure events. In this\nwork, we propose a two-stage framework, SODor, that explicitly models seizure\nonset through a novel task formulation of subsequence clustering. Given an EEG\nsequence, the framework first learns a set of second-level embeddings with\nlabel supervision. It then employs model-based clustering to explicitly capture\nlong-term temporal dependencies in EEG sequences and identify meaningful\nsubsequences. Epochs within a subsequence share a common cluster assignment\n(normal or seizure), with cluster or state transitions representing successful\nonset detections. 
Extensive experiments on three datasets demonstrate that our\nmethod can correct misclassifications, achieving 5\\%-11\\% classification\nimprovements over other baselines and accurately detecting seizure onsets.\n","authors":["Zheng Chen","Yasuko Matsubara","Yasushi Sakurai","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2412.15598v2.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2402.18945v4","updated":"2025-03-03T06:34:48Z","published":"2024-02-29T08:20:49Z","title":"SynGhost: Invisible and Universal Task-agnostic Backdoor Attack via\n Syntactic Transfer","summary":" Although pre-training achieves remarkable performance, it suffers from\ntask-agnostic backdoor attacks due to vulnerabilities in data and training\nmechanisms. These attacks can transfer backdoors to various downstream tasks.\nIn this paper, we introduce $\\mathtt{maxEntropy}$, an entropy-based poisoning\nfilter that mitigates such risks. To overcome the limitations of manual target\nsetting and explicit triggers, we propose $\\mathtt{SynGhost}$, an invisible and\nuniversal task-agnostic backdoor attack via syntactic transfer, further\nexposing vulnerabilities in pre-trained language models (PLMs). Specifically,\n$\\mathtt{SynGhost}$ injects multiple syntactic backdoors into the pre-training\nspace through corpus poisoning, while preserving the PLM's pre-training\ncapabilities. Second, $\\mathtt{SynGhost}$ adaptively selects optimal targets\nbased on contrastive learning, creating a uniform distribution in the\npre-training space. To identify syntactic differences, we also introduce an\nawareness module to minimize interference between backdoors. Experiments show\nthat $\\mathtt{SynGhost}$ poses significant threats and can transfer to various\ndownstream tasks. Furthermore, $\\mathtt{SynGhost}$ resists defenses based on\nperplexity, fine-pruning, and $\\mathtt{maxEntropy}$. 
The code is available at\nhttps://github.com/Zhou-CyberSecurity-AI/SynGhost.\n","authors":["Pengzhou Cheng","Wei Du","Zongru Wu","Fengwei Zhang","Libo Chen","Zhuosheng Zhang","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2402.18945v4.pdf","comment":"17 pages, 16 figures, 12 tables, accepted at NAACL 2025 Findings"},{"id":"http://arxiv.org/abs/2412.19160v2","updated":"2025-03-03T06:34:25Z","published":"2024-12-26T10:40:15Z","title":"Cross-Spectral Vision Transformer for Biometric Authentication using\n Forehead Subcutaneous Vein Pattern and Periocular Pattern","summary":" Traditional biometric systems have encountered significant setbacks due to\nvarious unavoidable factors, for example, face recognition-based biometrics\nfails due to the wearing of face masks and fingerprints create hygiene\nconcerns. This paper proposes a novel lightweight cross-spectral vision\ntransformer (CS-ViT) for biometric authentication using forehead subcutaneous\nvein patterns and periocular patterns, offering a promising alternative to\ntraditional methods, capable of performing well even with the face masks and\nwithout any physical touch. The proposed framework comprises a cross-spectral\ndual-channel architecture designed to handle two distinct biometric traits and\nto capture inter-dependencies in terms of relative spectral patterns. Each\nchannel consists of a Phase-Only Correlation Cross-Spectral Attention (POC-CSA)\nthat captures their individual as well as correlated patterns. The computation\nof cross-spectral attention using POC extracts the phase correlation in the\nspatial features. Therefore, it is robust against the resolution/intensity\nvariations and illumination of the input images, assuming both biometric traits\nare from the same person. The lightweight model is suitable for edge device\ndeployment. The performance of the proposed algorithm was rigorously evaluated\nusing the Forehead Subcutaneous Vein Pattern and Periocular Biometric Pattern\n(FSVP-PBP) database. 
The results demonstrated the superiority of the algorithm\nover state-of-the-art methods, achieving a remarkable classification accuracy\nof 98.8% with the combined vein and periocular patterns.\n","authors":["Arun K. Sharma","Shubhobrata Bhattacharya","Motahar Reza","Bishakh Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2412.19160v2.pdf","comment":"Submitted to IEEE TPAMI"},{"id":"http://arxiv.org/abs/2502.17204v2","updated":"2025-03-03T06:29:31Z","published":"2025-02-24T14:39:28Z","title":"Order Matters: Investigate the Position Bias in Multi-constraint\n Instruction Following","summary":" Real-world instructions with multiple constraints pose a significant\nchallenge to existing large language models (LLMs). An observation is that the\nLLMs exhibit dramatic performance fluctuation when disturbing the order of the\nincorporated constraints. Yet, none of the existing works has systematically\ninvestigated this position bias problem in the field of multi-constraint\ninstruction following. To bridge this gap, we design a probing task where we\nquantitatively measure the difficulty distribution of the constraints by a\nnovel Difficulty Distribution Index (CDDI). Through the experimental results,\nwe find that LLMs are more performant when presented with the constraints in a\n``hard-to-easy'' order. This preference can be generalized to LLMs with\ndifferent architecture or different sizes of parameters. Additionally, we\nconduct an explanation study, providing an intuitive insight into the\ncorrelation between the LLM's attention and constraint orders. 
Our code and\ndataset are publicly available at https://github.com/meowpass/PBIF.\n","authors":["Jie Zeng","Qianyu He","Qingyu Ren","Jiaqing Liang","Yanghua Xiao","Weikang Zhou","Zeye Sun","Fei Yu"],"pdf_url":"https://arxiv.org/pdf/2502.17204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01405v4","updated":"2025-03-03T06:14:14Z","published":"2023-10-02T17:59:07Z","title":"Representation Engineering: A Top-Down Approach to AI Transparency","summary":" In this paper, we identify and characterize the emerging area of\nrepresentation engineering (RepE), an approach to enhancing the transparency of\nAI systems that draws on insights from cognitive neuroscience. RepE places\npopulation-level representations, rather than neurons or circuits, at the\ncenter of analysis, equipping us with novel methods for monitoring and\nmanipulating high-level cognitive phenomena in deep neural networks (DNNs). We\nprovide baselines and an initial analysis of RepE techniques, showing that they\noffer simple yet effective solutions for improving our understanding and\ncontrol of large language models. We showcase how these methods can provide\ntraction on a wide range of safety-relevant problems, including honesty,\nharmlessness, power-seeking, and more, demonstrating the promise of top-down\ntransparency research. We hope that this work catalyzes further exploration of\nRepE and fosters advancements in the transparency and safety of AI systems.\n","authors":["Andy Zou","Long Phan","Sarah Chen","James Campbell","Phillip Guo","Richard Ren","Alexander Pan","Xuwang Yin","Mantas Mazeika","Ann-Kathrin Dombrowski","Shashwat Goel","Nathaniel Li","Michael J. Byun","Zifan Wang","Alex Mallen","Steven Basart","Sanmi Koyejo","Dawn Song","Matt Fredrikson","J. 
Zico Kolter","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2310.01405v4.pdf","comment":"Code is available at\n https://github.com/andyzoujm/representation-engineering"},{"id":"http://arxiv.org/abs/2411.02886v2","updated":"2025-03-03T05:49:41Z","published":"2024-11-05T07:56:24Z","title":"TokenSelect: Efficient Long-Context Inference and Length Extrapolation\n for LLMs via Dynamic Token-Level KV Cache Selection","summary":" The rapid advancement of Large Language Models (LLMs) has driven growing\ndemand for processing extended context sequences in contemporary applications.\nHowever, this progress faces two major challenges: performance degradation due\nto sequence lengths out-of-distribution, and excessively long inference times\ncaused by the quadratic computational complexity of attention. These issues\nhinder the application of LLMs in long-context scenarios. In this paper, we\npropose Dynamic Token-Level KV Cache Selection (TokenSelect), a training-free\nmethod for efficient and accurate long-context inference. TokenSelect builds\nupon the observation of non-contiguous attention sparsity, using Query-Key dot\nproducts to measure per-head KV Cache criticality at token-level. By per-head\nsoft voting mechanism, TokenSelect selectively involves a few critical KV cache\ntokens in attention calculation without sacrificing accuracy. To further\naccelerate TokenSelect, we design the Selection Cache based on observations of\nconsecutive Query similarity and implemented efficient dot product kernel,\nsignificantly reducing the overhead. 
A comprehensive evaluation of TokenSelect\ndemonstrates up to 23.84x speedup in attention computation and up to 2.28x\nacceleration in end-to-end latency, while providing superior performance\ncompared to state-of-the-art long-context inference methods.\n","authors":["Wei Wu","Zhuoshi Pan","Chao Wang","Liyi Chen","Yunchu Bai","Tianfu Wang","Kun Fu","Zheng Wang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2411.02886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02268v3","updated":"2025-03-03T05:32:47Z","published":"2024-10-03T07:40:14Z","title":"Structural-Entropy-Based Sample Selection for Efficient and Effective\n Learning","summary":" Sample selection improves the efficiency and effectiveness of machine\nlearning models by providing informative and representative samples. Typically,\nsamples can be modeled as a sample graph, where nodes are samples and edges\nrepresent their similarities. Most existing methods are based on local\ninformation, such as the training difficulty of samples, thereby overlooking\nglobal information, such as connectivity patterns. This oversight can result in\nsuboptimal selection because global information is crucial for ensuring that\nthe selected samples well represent the structural properties of the graph. To\naddress this issue, we employ structural entropy to quantify global information\nand losslessly decompose it from the whole graph to individual nodes using the\nShapley value. Based on the decomposition, we present\n$\\textbf{S}$tructural-$\\textbf{E}$ntropy-based sample $\\textbf{S}$election\n($\\textbf{SES}$), a method that integrates both global and local information to\nselect informative and representative samples. SES begins by constructing a\n$k$NN-graph among samples based on their similarities. It then measures sample\nimportance by combining structural entropy (global metric) with training\ndifficulty (local metric). 
Finally, SES applies importance-biased blue noise\nsampling to select a set of diverse and representative samples. Comprehensive\nexperiments on three learning scenarios -- supervised learning, active\nlearning, and continual learning -- clearly demonstrate the effectiveness of\nour method.\n","authors":["Tianchi Xie","Jiangning Zhu","Guozu Ma","Minzhi Lin","Wei Chen","Weikai Yang","Shixia Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02268v3.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.01912v2","updated":"2025-03-03T05:25:43Z","published":"2025-02-04T01:05:12Z","title":"PATCH: a deep learning method to assess heterogeneity of artistic\n practice in historical paintings","summary":" The history of art has seen significant shifts in the manner in which\nartworks are created, making understanding of creative processes a central\nquestion in technical art history. In the Renaissance and Early Modern period,\npaintings were largely produced by master painters directing workshops of\napprentices who often contributed to projects. The masters varied significantly\nin artistic and managerial styles, meaning different combinations of artists\nand implements might be seen both between masters and within workshops or even\nindividual canvases. Information on how different workshops were managed and\nthe processes by which artworks were created remains elusive. 
Machine learning\nmethods have potential to unearth new information about artists' creative\nprocesses by extending the analysis of brushwork to a microscopic scale.\nAnalysis of workshop paintings, however, presents a challenge in that\ndocumentation of the artists and materials involved is sparse, meaning external\nexamples are not available to train networks to recognize their contributions.\nHere we present a novel machine learning approach we call pairwise assignment\ntraining for classifying heterogeneity (PATCH) that is capable of identifying\nindividual artistic practice regimes with no external training data, or \"ground\ntruth.\" The method achieves unsupervised results by supervised means, and\noutperforms both simple statistical procedures and unsupervised machine\nlearning methods. We apply this method to two historical paintings by the\nSpanish Renaissance master, El Greco: The Baptism of Christ and Christ on the\nCross with Landscape, and our findings regarding the former potentially\nchallenge previous work that has assigned the painting to workshop members.\nFurther, the results of our analyses create a measure of heterogeneity of\nartistic practice that can be used to characterize artworks across time and\nspace.\n","authors":["Andrew Van Horn","Lauryn Smith","Mahamad Mahmoud","Michael McMaster","Clara Pinchbeck","Ina Martin","Andrew Lininger","Anthony Ingrisano","Adam Lowe","Carlos Bayod","Elizabeth Bolman","Kenneth Singer","Michael Hinczewski"],"pdf_url":"https://arxiv.org/pdf/2502.01912v2.pdf","comment":"main text: 16 pages, 6 figures; SI: 7 pages, 3 figures; v2: minor\n typo corrections, higher resolution figures"},{"id":"http://arxiv.org/abs/2502.17720v2","updated":"2025-03-03T04:31:48Z","published":"2025-02-24T23:23:27Z","title":"Spontaneous Giving and Calculated Greed in Language Models","summary":" Large language models demonstrate advanced problem-solving capabilities by\nincorporating reasoning techniques such as chain of thought and 
reflection.\nHowever, how these reasoning capabilities extend to social intelligence remains\nunclear. In this study, we investigate this question using economic games that\nmodel social dilemmas, where social intelligence plays a crucial role. First,\nwe examine the effects of chain-of-thought and reflection techniques in a\npublic goods game. We then extend our analysis to six economic games on\ncooperation and punishment, comparing off-the-shelf non-reasoning and reasoning\nmodels. We find that reasoning models significantly reduce cooperation and norm\nenforcement, prioritizing individual rationality. Consequently, groups with\nmore reasoning models exhibit less cooperation and lower gains through repeated\ninteractions. These behaviors parallel human tendencies of \"spontaneous giving\nand calculated greed.\" Our results suggest the need for AI architectures that\nincorporate social intelligence alongside reasoning capabilities to ensure that\nAI supports, rather than disrupts, human cooperative intuition.\n","authors":["Yuxuan Li","Hirokazu Shirado"],"pdf_url":"https://arxiv.org/pdf/2502.17720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09906v3","updated":"2025-03-03T04:28:49Z","published":"2024-02-15T12:12:19Z","title":"Generative Representational Instruction Tuning","summary":" All text-based language problems can be reduced to either generation or\nembedding. Current models only perform well at one or the other. We introduce\ngenerative representational instruction tuning (GRIT) whereby a large language\nmodel is trained to handle both generative and embedding tasks by\ndistinguishing between them through instructions. Compared to other open\nmodels, our resulting GritLM 7B sets a new state of the art on the Massive Text\nEmbedding Benchmark (MTEB) and outperforms all models up to its size on a range\nof generative tasks. 
By scaling up further, GritLM 8x7B outperforms all open\ngenerative language models that we tried while still being among the best\nembedding models. Notably, we find that GRIT matches training on only\ngenerative or embedding data, thus we can unify both at no performance loss.\nAmong other benefits, the unification via GRIT speeds up Retrieval-Augmented\nGeneration (RAG) by > 60% for long documents, by no longer requiring separate\nretrieval and generation models. Models, code, etc. are freely available at\nhttps://github.com/ContextualAI/gritlm.\n","authors":["Niklas Muennighoff","Hongjin Su","Liang Wang","Nan Yang","Furu Wei","Tao Yu","Amanpreet Singh","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.09906v3.pdf","comment":"67 pages (16 main), 25 figures, 34 tables"},{"id":"http://arxiv.org/abs/2410.08892v2","updated":"2025-03-03T04:14:17Z","published":"2024-10-11T15:10:38Z","title":"Federated Learning in Practice: Reflections and Projections","summary":" Federated Learning (FL) is a machine learning technique that enables multiple\nentities to collaboratively learn a shared model without exchanging their local\ndata. Over the past decade, FL systems have achieved substantial progress,\nscaling to millions of devices across various learning domains while offering\nmeaningful differential privacy (DP) guarantees. Production systems from\norganizations like Google, Apple, and Meta demonstrate the real-world\napplicability of FL. However, key challenges remain, including verifying\nserver-side DP guarantees and coordinating training across heterogeneous\ndevices, limiting broader adoption. Additionally, emerging trends such as large\n(multi-modal) models and blurred lines between training, inference, and\npersonalization challenge traditional FL frameworks. In response, we propose a\nredefined FL framework that prioritizes privacy principles rather than rigid\ndefinitions. 
We also chart a path forward by leveraging trusted execution\nenvironments and open-source ecosystems to address these challenges and\nfacilitate future advancements in FL.\n","authors":["Katharine Daly","Hubert Eichner","Peter Kairouz","H. Brendan McMahan","Daniel Ramage","Zheng Xu"],"pdf_url":"https://arxiv.org/pdf/2410.08892v2.pdf","comment":"Published at 2024 IEEE 6th International Conference on Trust, Privacy\n and Security in Intelligent Systems, and Applications (TPS-ISA)"},{"id":"http://arxiv.org/abs/2502.20808v2","updated":"2025-03-03T03:43:03Z","published":"2025-02-28T07:50:36Z","title":"MV-MATH: Evaluating Multimodal Math Reasoning in Multi-Visual Contexts","summary":" Multimodal Large Language Models (MLLMs) have shown promising capabilities in\nmathematical reasoning within visual contexts across various datasets. However,\nmost existing multimodal math benchmarks are limited to single-visual contexts,\nwhich diverges from the multi-visual scenarios commonly encountered in\nreal-world mathematical applications. To address this gap, we introduce\nMV-MATH: a meticulously curated dataset of 2,009 high-quality mathematical\nproblems. Each problem integrates multiple images interleaved with text,\nderived from authentic K-12 scenarios, and enriched with detailed annotations.\nMV-MATH includes multiple-choice, free-form, and multi-step questions, covering\n11 subject areas across 3 difficulty levels, and serves as a comprehensive and\nrigorous benchmark for assessing MLLMs' mathematical reasoning in multi-visual\ncontexts. Through extensive experimentation, we observe that MLLMs encounter\nsubstantial challenges in multi-visual math tasks, with a considerable\nperformance gap relative to human capabilities on MV-MATH. 
Furthermore, we\nanalyze the performance and error patterns of various models, providing\ninsights into MLLMs' mathematical reasoning capabilities within multi-visual\nsettings.\n","authors":["Peijie Wang","Zhongzhi Li","Fei Yin","Dekang Ran","Chenglin Liu"],"pdf_url":"https://arxiv.org/pdf/2502.20808v2.pdf","comment":"47 pages"},{"id":"http://arxiv.org/abs/2407.00617v4","updated":"2025-03-03T03:41:11Z","published":"2024-06-30T08:00:34Z","title":"Iterative Nash Policy Optimization: Aligning LLMs with General\n Preferences via No-Regret Learning","summary":" Reinforcement Learning with Human Feedback (RLHF) has achieved great success\nin aligning large language models (LLMs) with human preferences. Prevalent RLHF\napproaches are reward-based, following the Bradley-Terry (BT) model assumption,\nwhich may not fully capture the complexity of human preferences. In this paper,\nwe explore RLHF under a general preference framework and approach it from a\ngame-theoretic perspective. Specifically, we formulate the problem as a\ntwo-player game and propose a novel online algorithm, iterative Nash policy\noptimization (INPO). The key idea is to let the policy play against itself via\nno-regret learning, thereby approximating the Nash policy. Unlike previous\nmethods, INPO bypasses the need for estimating the expected win rate for\nindividual responses, which typically incurs high computational or annotation\ncosts. Instead, we introduce a new loss objective that is directly minimized\nover a preference dataset. We provide theoretical analysis for our approach and\ndemonstrate its effectiveness through experiments on various representative\nbenchmarks. 
With an LLaMA-3-8B-based SFT model, INPO achieves a 42.6%\nlength-controlled win rate on AlpacaEval 2.0 and a 37.8% win rate on\nArena-Hard, showing substantial improvement over the state-of-the-art online\nRLHF algorithms.\n","authors":["Yuheng Zhang","Dian Yu","Baolin Peng","Linfeng Song","Ye Tian","Mingyue Huo","Nan Jiang","Haitao Mi","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2407.00617v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04405v3","updated":"2025-03-03T03:39:50Z","published":"2024-07-05T10:41:15Z","title":"Discovering physical laws with parallel combinatorial tree search","summary":" Symbolic regression plays a crucial role in modern scientific research thanks\nto its capability of discovering concise and interpretable mathematical\nexpressions from data. A grand challenge lies in the arduous search for\nparsimonious and generalizable mathematical formulas, in an infinite search\nspace, while intending to fit the training data. Existing algorithms have faced\na critical bottleneck of accuracy and efficiency over a decade when handling\nproblems of complexity, which essentially hinders the pace of applying symbolic\nregression for scientific exploration across interdisciplinary domains. To this\nend, we introduce a parallel combinatorial tree search (PCTS) model to\nefficiently distill generic mathematical expressions from limited data. Through\na series of extensive experiments, we demonstrate the superior accuracy and\nefficiency of PCTS for equation discovery, which greatly outperforms the\nstate-of-the-art baseline models on over 200 synthetic and experimental\ndatasets (e.g., lifting its performance by up to 99% accuracy improvement and\none-order of magnitude speed up). 
PCTS represents a key advance in accurate and\nefficient data-driven discovery of symbolic, interpretable models (e.g.,\nunderlying physical laws) and marks a pivotal transition towards scalable\nsymbolic learning.\n","authors":["Kai Ruan","Yilong Xu","Ze-Feng Gao","Yike Guo","Hao Sun","Ji-Rong Wen","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.04405v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17710v4","updated":"2025-03-03T03:36:17Z","published":"2024-03-26T13:58:00Z","title":"Optimization-based Prompt Injection Attack to LLM-as-a-Judge","summary":" LLM-as-a-Judge uses a large language model (LLM) to select the best response\nfrom a set of candidates for a given question. LLM-as-a-Judge has many\napplications such as LLM-powered search, reinforcement learning with AI\nfeedback (RLAIF), and tool selection. In this work, we propose JudgeDeceiver,\nan optimization-based prompt injection attack to LLM-as-a-Judge. JudgeDeceiver\ninjects a carefully crafted sequence into an attacker-controlled candidate\nresponse such that LLM-as-a-Judge selects the candidate response for an\nattacker-chosen question no matter what other candidate responses are.\nSpecifically, we formulate finding such sequence as an optimization problem and\npropose a gradient based method to approximately solve it. Our extensive\nevaluation shows that JudgeDeceive is highly effective, and is much more\neffective than existing prompt injection attacks that manually craft the\ninjected sequences and jailbreak attacks when extended to our problem. We also\nshow the effectiveness of JudgeDeceiver in three case studies, i.e.,\nLLM-powered search, RLAIF, and tool selection. Moreover, we consider defenses\nincluding known-answer detection, perplexity detection, and perplexity windowed\ndetection. Our results show these defenses are insufficient, highlighting the\nurgent need for developing new defense strategies. 
Our implementation is\navailable at this repository: https://github.com/ShiJiawenwen/JudgeDeceiver.\n","authors":["Jiawen Shi","Zenghui Yuan","Yinuo Liu","Yue Huang","Pan Zhou","Lichao Sun","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2403.17710v4.pdf","comment":"To appear in the Proceedings of The ACM Conference on Computer and\n Communications Security (CCS), 2024"},{"id":"http://arxiv.org/abs/2502.01117v2","updated":"2025-03-03T03:35:00Z","published":"2025-02-03T07:13:59Z","title":"Learning to Learn Weight Generation via Trajectory Diffusion","summary":" Diffusion-based algorithms have emerged as promising techniques for weight\ngeneration, particularly in scenarios like multi-task learning that require\nfrequent weight updates. However, existing solutions suffer from limited\ncross-task transferability. In addition, they only utilize optimal weights as\ntraining samples, ignoring the value of other weights in the optimization\nprocess. To address these issues, we propose Lt-Di, which integrates the\ndiffusion algorithm with meta-learning to generate weights for unseen tasks.\nFurthermore, we extend the vanilla diffusion algorithm into a trajectory\ndiffusion algorithm to utilize other weights along the optimization trajectory.\nTrajectory diffusion decomposes the entire diffusion chain into multiple\nshorter ones, improving training and inference efficiency. We analyze the\nconvergence properties of the weight generation paradigm and improve\nconvergence efficiency without additional time overhead. 
Our experiments\ndemonstrate Lt-Di's higher accuracy while reducing computational overhead\nacross various tasks, including zero-shot and few-shot learning, multi-domain\ngeneralization, and large-scale language model fine-tuning.Our code is released\nat https://anonymous.4open.science/r/Lt-Di-0E51.\n","authors":["Yunchuan Guan","Yu Liu","Ke Zhou","Zhiqi Shen","Serge Belongie","Jenq-Neng Hwang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2502.01117v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21256v2","updated":"2025-03-03T03:23:44Z","published":"2024-10-28T17:54:29Z","title":"Multi-modal AI for comprehensive breast cancer prognostication","summary":" Treatment selection in breast cancer is guided by molecular subtypes and\nclinical characteristics. However, current tools including genomic assays lack\nthe accuracy required for optimal clinical decision-making. We developed a\nnovel artificial intelligence (AI)-based approach that integrates digital\npathology images with clinical data, providing a more robust and effective\nmethod for predicting the risk of cancer recurrence in breast cancer patients.\nSpecifically, we utilized a vision transformer pan-cancer foundation model\ntrained with self-supervised learning to extract features from digitized\nH&E-stained slides. These features were integrated with clinical data to form a\nmulti-modal AI test predicting cancer recurrence and death. The test was\ndeveloped and evaluated using data from a total of 8,161 female breast cancer\npatients across 15 cohorts originating from seven countries. Of these, 3,502\npatients from five cohorts were used exclusively for evaluation, while the\nremaining patients were used for training. Our test accurately predicted our\nprimary endpoint, disease-free interval, in the five evaluation cohorts\n(C-index: 0.71 [0.68-0.75], HR: 3.63 [3.02-4.37, p<0.001]). 
In a direct\ncomparison (n=858), the AI test was more accurate than Oncotype DX, the\nstandard-of-care 21-gene assay, achieving a C-index of 0.67 [0.61-0.74] versus\n0.61 [0.49-0.73], respectively. Additionally, the AI test added independent\nprognostic information to Oncotype DX in a multivariate analysis (HR: 3.11\n[1.91-5.09, p<0.001)]). The test demonstrated robust accuracy across major\nmolecular breast cancer subtypes, including TNBC (C-index: 0.71 [0.62-0.81],\nHR: 3.81 [2.35-6.17, p=0.02]), where no diagnostic tools are currently\nrecommended by clinical guidelines. These results suggest that our AI test\nimproves upon the accuracy of existing prognostic tests, while being applicable\nto a wider range of patients.\n","authors":["Jan Witowski","Ken G. Zeng","Joseph Cappadona","Jailan Elayoubi","Khalil Choucair","Elena Diana Chiru","Nancy Chan","Young-Joon Kang","Frederick Howard","Irina Ostrovnaya","Carlos Fernandez-Granda","Freya Schnabel","Zoe Steinsnyder","Ugur Ozerdem","Kangning Liu","Waleed Abdulsattar","Yu Zong","Lina Daoud","Rafic Beydoun","Anas Saad","Nitya Thakore","Mohammad Sadic","Frank Yeung","Elisa Liu","Theodore Hill","Benjamin Swett","Danielle Rigau","Andrew Clayburn","Valerie Speirs","Marcus Vetter","Lina Sojak","Simone Soysal","Daniel Baumhoer","Jia-Wern Pan","Haslina Makmur","Soo-Hwang Teo","Linda Ma Pak","Victor Angel","Dovile Zilenaite-Petrulaitiene","Arvydas Laurinavicius","Natalie Klar","Brian D. Piening","Carlo Bifulco","Sun-Young Jun","Jae Pak Yi","Su Hyun Lim","Adam Brufsky","Francisco J. Esteva","Lajos Pusztai","Yann LeCun","Krzysztof J. Geras"],"pdf_url":"https://arxiv.org/pdf/2410.21256v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13213v2","updated":"2025-03-03T03:20:08Z","published":"2024-10-17T04:37:37Z","title":"LLMOPT: Learning to Define and Solve General Optimization Problems from\n Scratch","summary":" Optimization problems are prevalent across various scenarios. 
Formulating and\nthen solving optimization problems described by natural language often requires\nhighly specialized human expertise, which could block the widespread\napplication of optimization-based decision making. To automate problem\nformulation and solving, leveraging large language models (LLMs) has emerged as\na potential way. However, this kind of approach suffers from the issue of\noptimization generalization. Namely, the accuracy of most current LLM-based\nmethods and the generality of optimization problem types that they can model\nare still limited. In this paper, we propose a unified learning-based framework\ncalled LLMOPT to boost optimization generalization. Starting from the natural\nlanguage descriptions of optimization problems and a pre-trained LLM, LLMOPT\nconstructs the introduced five-element formulation as a universal model for\nlearning to define diverse optimization problem types. Then, LLMOPT employs the\nmulti-instruction tuning to enhance both problem formalization and solver code\ngeneration accuracy and generality. After that, to prevent hallucinations in\nLLMs, such as sacrificing solving accuracy to avoid execution errors, the model\nalignment and self-correction mechanism are adopted in LLMOPT. We evaluate the\noptimization generalization ability of LLMOPT and compared methods across six\nreal-world datasets covering roughly 20 fields such as health, environment,\nenergy and manufacturing, etc. Extensive experiment results show that LLMOPT is\nable to model various optimization problem types such as linear/nonlinear\nprogramming, mixed integer programming, and combinatorial optimization, and\nachieves a notable 11.08% average solving accuracy improvement compared with\nthe state-of-the-art methods. 
The code is available at\nhttps://github.com/caigaojiang/LLMOPT.\n","authors":["Caigao Jiang","Xiang Shu","Hong Qian","Xingyu Lu","Jun Zhou","Aimin Zhou","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2410.13213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12844v2","updated":"2025-03-03T03:18:40Z","published":"2025-01-22T12:45:09Z","title":"GAMED-Snake: Gradient-aware Adaptive Momentum Evolution Deep Snake Model\n for Multi-organ Segmentation","summary":" Multi-organ segmentation is a critical yet challenging task due to complex\nanatomical backgrounds, blurred boundaries, and diverse morphologies. This\nstudy introduces the Gradient-aware Adaptive Momentum Evolution Deep Snake\n(GAMED-Snake) model, which establishes a novel paradigm for contour-based\nsegmentation by integrating gradient-based learning with adaptive momentum\nevolution mechanisms. The GAMED-Snake model incorporates three major\ninnovations: First, the Distance Energy Map Prior (DEMP) generates a\npixel-level force field that effectively attracts contour points towards the\ntrue boundaries, even in scenarios with complex backgrounds and blurred edges.\nSecond, the Differential Convolution Inception Module (DCIM) precisely extracts\ncomprehensive energy gradients, significantly enhancing segmentation accuracy.\nThird, the Adaptive Momentum Evolution Mechanism (AMEM) employs cross-attention\nto establish dynamic features across different iterations of evolution,\nenabling precise boundary alignment for diverse morphologies. Experimental\nresults on four challenging multi-organ segmentation datasets demonstrate that\nGAMED-Snake improves the mDice metric by approximately 2% compared to\nstate-of-the-art methods. 
Code will be available at\nhttps://github.com/SYSUzrc/GAMED-Snake.\n","authors":["Ruicheng Zhang","Haowei Guo","Zeyu Zhang","Puxin Yan","Shen Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.17242v3","updated":"2025-03-03T03:08:43Z","published":"2024-12-23T03:30:34Z","title":"On the Generalization and Adaptation Ability of Machine-Generated Text\n Detectors in Academic Writing","summary":" The rising popularity of large language models (LLMs) has raised concerns\nabout machine-generated text (MGT), particularly in academic settings, where\nissues like plagiarism and misinformation are prevalent. As a result,\ndeveloping a highly generalizable and adaptable MGT detection system has become\nan urgent priority. Given that LLMs are most commonly misused in academic\nwriting, this work investigates the generalization and adaptation capabilities\nof MGT detectors in three key aspects specific to academic writing: First, we\nconstruct MGT-Acedemic, a large-scale dataset comprising over 336M tokens and\n749K samples. MGT-Acedemic focuses on academic writing, featuring human-written\ntexts (HWTs) and MGTs across STEM, Humanities, and Social Sciences, paired with\nan extensible code framework for efficient benchmarking. Second, we benchmark\nthe performance of various detectors for binary classification and attribution\ntasks in both in-domain and cross-domain settings. This benchmark reveals the\noften-overlooked challenges of attribution tasks. Third, we introduce a novel\nattribution task where models have to adapt to new classes over time without\n(or with very limited) access to prior training data in both few-shot and\nmany-shot scenarios. We implement eight different adapting techniques to\nimprove the performance and highlight the inherent complexity of the task. 
Our\nfindings provide insights into the generalization and adaptation ability of MGT\ndetectors across diverse scenarios and lay the foundation for building robust,\nadaptive detection systems. The code framework is available at\nhttps://github.com/Y-L-LIU/MGTBench-2.0.\n","authors":["Yule Liu","Zhiyuan Zhong","Yifan Liao","Zhen Sun","Jingyi Zheng","Jiaheng Wei","Qingyuan Gong","Fenghua Tong","Yang Chen","Yang Zhang","Xinlei He"],"pdf_url":"https://arxiv.org/pdf/2412.17242v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06600v3","updated":"2025-03-03T03:05:30Z","published":"2024-06-06T13:44:57Z","title":"HORAE: A Domain-Agnostic Modeling Language for Automating Multimodal\n Service Regulation","summary":" Artificial intelligence is rapidly encroaching on the field of service\nregulation. This work-in-progress article presents the design principles behind\nHORAE, a unified specification language to model multimodal regulation rules\nacross a diverse set of domains. We show how HORAE facilitates an intelligent\nservice regulation pipeline by further exploiting a fine-tuned large language\nmodel named HORAE that automates the HORAE modeling process, thereby yielding\nan end-to-end framework for fully automated intelligent service regulation.\n","authors":["Yutao Sun","Mingshuai Chen","Kangjia Zhao","Jintao Chen"],"pdf_url":"https://arxiv.org/pdf/2406.06600v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07404v2","updated":"2025-03-03T03:02:55Z","published":"2024-11-11T22:22:21Z","title":"Controllable Context Sensitivity and the Knob Behind It","summary":" When making predictions, a language model must trade off how much it relies\non its context vs. its prior knowledge. Choosing how sensitive the model is to\nits context is a fundamental functionality, as it enables the model to excel at\ntasks like retrieval-augmented generation and question-answering. 
In this\npaper, we search for a knob which controls this sensitivity, determining\nwhether language models answer from the context or their prior knowledge. To\nguide this search, we design a task for controllable context sensitivity. In\nthis task, we first feed the model a context (Paris is in England) and a\nquestion (Where is Paris?); we then instruct the model to either use its prior\nor contextual knowledge and evaluate whether it generates the correct answer\nfor both intents (either France or England). When fine-tuned on this task,\ninstruction-tuned versions of Llama-3.1, Mistral-v0.3, and Gemma-2 can solve it\nwith high accuracy (85-95%). Analyzing these high-performing models, we narrow\ndown which layers may be important to context sensitivity using a novel linear\ntime algorithm. Then, in each model, we identify a 1-D subspace in a single\nlayer that encodes whether the model follows context or prior knowledge.\nInterestingly, while we identify this subspace in a fine-tuned model, we find\nthat the exact same subspace serves as an effective knob in not only that model\nbut also non-fine-tuned instruct and base models of that model family. 
Finally,\nwe show a strong correlation between a model's performance and how distinctly\nit separates context-agreeing from context-ignoring answers in this subspace.\nThese results suggest a single subspace facilitates how the model chooses\nbetween context and prior knowledge, hinting at a simple fundamental mechanism\nthat controls this behavior.\n","authors":["Julian Minder","Kevin Du","Niklas Stoehr","Giovanni Monea","Chris Wendler","Robert West","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07404v2.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.20854v2","updated":"2025-03-03T03:00:59Z","published":"2025-02-28T08:53:08Z","title":"A Pilot Empirical Study on When and How to Use Knowledge Graphs as\n Retrieval Augmented Generation","summary":" The integration of Knowledge Graphs (KGs) into the Retrieval Augmented\nGeneration (RAG) framework has attracted significant interest, with early\nstudies showing promise in mitigating hallucinations and improving model\naccuracy. However, a systematic understanding and comparative analysis of the\nrapidly emerging KG-RAG methods are still lacking. This paper seeks to lay the\nfoundation for systematically answering the question of when and how to use\nKG-RAG by analyzing their performance in various application scenarios\nassociated with different technical configurations. After outlining the mind\nmap using KG-RAG framework and summarizing its popular pipeline, we conduct a\npilot empirical study of KG-RAG works to reimplement and evaluate 6 KG-RAG\nmethods across 7 datasets in diverse scenarios, analyzing the impact of 9\nKG-RAG configurations in combination with 17 LLMs. 
Our results underscore the\ncritical role of appropriate application conditions and optimal configurations\nof KG-RAG components.\n","authors":["Xujie Yuan","Yongxu Liu","Shimin Di","Shiwen Wu","Libin Zheng","Rui Meng","Lei Chen","Xiaofang Zhou","Jian Yin"],"pdf_url":"https://arxiv.org/pdf/2502.20854v2.pdf","comment":"8 pages, 2 figures, 14 tables"},{"id":"http://arxiv.org/abs/2410.00564v3","updated":"2025-03-03T02:59:29Z","published":"2024-10-01T10:25:03Z","title":"Scaling Offline Model-Based RL via Jointly-Optimized World-Action Model\n Pretraining","summary":" A significant aspiration of offline reinforcement learning (RL) is to develop\na generalist agent with high capabilities from large and heterogeneous\ndatasets. However, prior approaches that scale offline RL either rely heavily\non expert trajectories or struggle to generalize to diverse unseen tasks.\nInspired by the excellent generalization of world model in conditional video\ngeneration, we explore the potential of image observation-based world model for\nscaling offline RL and enhancing generalization on novel tasks. In this paper,\nwe introduce JOWA: Jointly-Optimized World-Action model, an offline model-based\nRL agent pretrained on multiple Atari games with 6 billion tokens data to learn\ngeneral-purpose representation and decision-making ability. Our method jointly\noptimizes a world-action model through a shared transformer backbone, which\nstabilize temporal difference learning with large models during pretraining.\nMoreover, we propose a provably efficient and parallelizable planning algorithm\nto compensate for the Q-value estimation error and thus search out better\npolicies. Experimental results indicate that our largest agent, with 150\nmillion parameters, achieves 78.9% human-level performance on pretrained games\nusing only 10% subsampled offline data, outperforming existing state-of-the-art\nlarge-scale offline RL baselines by 31.6% on averange. 
Furthermore, JOWA scales\nfavorably with model capacity and can sample-efficiently transfer to novel\ngames using only 5k offline fine-tuning data (approximately 4 trajectories) per\ngame, demonstrating superior generalization. We will release codes and model\nweights at https://github.com/CJReinforce/JOWA\n","authors":["Jie Cheng","Ruixi Qiao","Yingwei Ma","Binhua Li","Gang Xiong","Qinghai Miao","Yongbin Li","Yisheng Lv"],"pdf_url":"https://arxiv.org/pdf/2410.00564v3.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2410.01337v3","updated":"2025-03-03T02:50:30Z","published":"2024-10-02T08:54:18Z","title":"PhyMPGN: Physics-encoded Message Passing Graph Network for\n spatiotemporal PDE systems","summary":" Solving partial differential equations (PDEs) serves as a cornerstone for\nmodeling complex dynamical systems. Recent progresses have demonstrated grand\nbenefits of data-driven neural-based models for predicting spatiotemporal\ndynamics (e.g., tremendous speedup gain compared with classical numerical\nmethods). However, most existing neural models rely on rich training data, have\nlimited extrapolation and generalization abilities, and suffer to produce\nprecise or reliable physical prediction under intricate conditions (e.g.,\nirregular mesh or geometry, complex boundary conditions, diverse PDE\nparameters, etc.). To this end, we propose a new graph learning approach,\nnamely, Physics-encoded Message Passing Graph Network (PhyMPGN), to model\nspatiotemporal PDE systems on irregular meshes given small training datasets.\nSpecifically, we incorporate a GNN into a numerical integrator to approximate\nthe temporal marching of spatiotemporal dynamics for a given PDE system.\nConsidering that many physical phenomena are governed by diffusion processes,\nwe further design a learnable Laplace block, which encodes the discrete\nLaplace-Beltrami operator, to aid and guide the GNN learning in a physically\nfeasible solution space. 
A boundary condition padding strategy is also designed\nto improve the model convergence and accuracy. Extensive experiments\ndemonstrate that PhyMPGN is capable of accurately predicting various types of\nspatiotemporal dynamics on coarse unstructured meshes, consistently achieves\nthe state-of-the-art results, and outperforms other baselines with considerable\ngains.\n","authors":["Bocheng Zeng","Qi Wang","Mengtao Yan","Yang Liu","Ruizhi Chengze","Yi Zhang","Hongsheng Liu","Zidong Wang","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2410.01337v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08109v3","updated":"2025-03-03T02:45:58Z","published":"2024-10-10T16:56:05Z","title":"A Closer Look at Machine Unlearning for Large Language Models","summary":" Large language models (LLMs) may memorize sensitive or copyrighted content,\nraising privacy and legal concerns. Due to the high cost of retraining from\nscratch, researchers attempt to employ machine unlearning to remove specific\ncontent from LLMs while preserving the overall performance. In this paper, we\ndiscuss several issues in machine unlearning for LLMs and provide our insights\non possible approaches. To address the issue of inadequate evaluation of model\noutputs after unlearning, we introduce three additional metrics to evaluate\ntoken diversity, sentence semantics, and factual correctness. We then\ncategorize unlearning methods into untargeted and targeted, and discuss their\nissues respectively. Specifically, the behavior that untargeted unlearning\nattempts to approximate is unpredictable and may involve hallucinations, and\nexisting regularization is insufficient for targeted unlearning. To alleviate\nthese issues, we propose using the objective of maximizing entropy (ME) for\nuntargeted unlearning and incorporate answer preservation (AP) loss as\nregularization for targeted unlearning. 
Experimental results across three\nscenarios, i.e., fictitious unlearning, continual unlearning, and real-world\nunlearning, demonstrate the effectiveness of our approaches. The code is\navailable at https://github.com/sail-sg/closer-look-LLM-unlearning.\n","authors":["Xiaojian Yuan","Tianyu Pang","Chao Du","Kejiang Chen","Weiming Zhang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2410.08109v3.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2502.21186v2","updated":"2025-03-03T02:33:31Z","published":"2025-02-28T16:02:23Z","title":"Scalable Decision-Making in Stochastic Environments through Learned\n Temporal Abstraction","summary":" Sequential decision-making in high-dimensional continuous action spaces,\nparticularly in stochastic environments, faces significant computational\nchallenges. We explore this challenge in the traditional offline RL setting,\nwhere an agent must learn how to make decisions based on data collected through\na stochastic behavior policy. We present Latent Macro Action Planner (L-MAP),\nwhich addresses this challenge by learning a set of temporally extended\nmacro-actions through a state-conditional Vector Quantized Variational\nAutoencoder (VQ-VAE), effectively reducing action dimensionality. L-MAP employs\na (separate) learned prior model that acts as a latent transition model and\nallows efficient sampling of plausible actions. During planning, our approach\naccounts for stochasticity in both the environment and the behavior policy by\nusing Monte Carlo tree search (MCTS). In offline RL settings, including\nstochastic continuous control tasks, L-MAP efficiently searches over discrete\nlatent actions to yield high expected returns. Empirical results demonstrate\nthat L-MAP maintains low decision latency despite increased action\ndimensionality. 
Notably, across tasks ranging from continuous control with\ninherently stochastic dynamics to high-dimensional robotic hand manipulation,\nL-MAP significantly outperforms existing model-based methods and performs\non-par with strong model-free actor-critic baselines, highlighting the\neffectiveness of the proposed approach in planning in complex and stochastic\nenvironments with high-dimensional action spaces.\n","authors":["Baiting Luo","Ava Pettet","Aron Laszka","Abhishek Dubey","Ayan Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2502.21186v2.pdf","comment":"Accepted by ICLR2025. Code would be available at\n https://github.com/BaitingLuo/L-MAP.git"},{"id":"http://arxiv.org/abs/2501.13983v3","updated":"2025-03-03T02:06:47Z","published":"2025-01-23T06:57:24Z","title":"AdEval: Alignment-based Dynamic Evaluation to Mitigate Data\n Contamination in Large Language Models","summary":" As Large Language Models (LLMs) are pretrained on massive-scale corpora, the\nissue of data contamination has become increasingly severe, leading to\npotential overestimation of model performance during evaluation. To address\nthis, we propose AdEval (Alignment-based Dynamic Evaluation), a dynamic data\nevaluation method aimed at mitigating the impact of data contamination on\nevaluation reliability. 
Experimental results on multiple datasets demonstrate\nthat AdEval effectively reduces the impact of data contamination on evaluation\noutcomes, enhancing both the fairness and reliability of the evaluation\nprocess.\n","authors":["Yang Fan"],"pdf_url":"https://arxiv.org/pdf/2501.13983v3.pdf","comment":"There are serious academic problems in this paper, such as data\n falsification and plagiarism in the method of the paper"},{"id":"http://arxiv.org/abs/2409.06214v3","updated":"2025-03-03T01:46:42Z","published":"2024-09-10T04:45:25Z","title":"Towards Generalizable Scene Change Detection","summary":" While current state-of-the-art Scene Change Detection (SCD) approaches\nachieve impressive results in well-trained research data, they become\nunreliable under unseen environments and different temporal conditions;\nin-domain performance drops from 77.6\\% to 8.0\\% in a previously unseen\nenvironment and to 4.6\\% under a different temporal condition -- calling for\ngeneralizable SCD and benchmark. In this work, we propose the Generalizable\nScene Change Detection Framework (GeSCF), which addresses unseen domain\nperformance and temporal consistency -- to meet the growing demand for anything\nSCD. Our method leverages the pre-trained Segment Anything Model (SAM) in a\nzero-shot manner. For this, we design Initial Pseudo-mask Generation and\nGeometric-Semantic Mask Matching -- seamlessly turning user-guided prompt and\nsingle-image based segmentation into scene change detection for a pair of\ninputs without guidance. Furthermore, we define the Generalizable Scene Change\nDetection (GeSCD) benchmark along with novel metrics and an evaluation protocol\nto facilitate SCD research in generalizability. 
In the process, we introduce\nthe ChangeVPR dataset, a collection of challenging image pairs with diverse\nenvironmental scenarios -- including urban, suburban, and rural settings.\nExtensive experiments across various datasets demonstrate that GeSCF achieves\nan average performance gain of 19.2\\% on existing SCD datasets and 30.0\\% on\nthe ChangeVPR dataset, nearly doubling the prior art performance. We believe\nour work can lay a solid foundation for robust and generalizable SCD research.\n","authors":["Jaewoo Kim","Uehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2409.06214v3.pdf","comment":"Manuscript. Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2409.02060v2","updated":"2025-03-03T01:25:46Z","published":"2024-09-03T17:08:20Z","title":"OLMoE: Open Mixture-of-Experts Language Models","summary":" We introduce OLMoE, a fully open, state-of-the-art language model leveraging\nsparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but\nuses only 1B per input token. We pretrain it on 5 trillion tokens and further\nadapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available\nmodels with similar active parameters, even surpassing larger ones like\nLlama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE\ntraining, analyze routing in our model showing high specialization, and\nopen-source all aspects of our work: model weights, training data, code, and\nlogs.\n","authors":["Niklas Muennighoff","Luca Soldaini","Dirk Groeneveld","Kyle Lo","Jacob Morrison","Sewon Min","Weijia Shi","Pete Walsh","Oyvind Tafjord","Nathan Lambert","Yuling Gu","Shane Arora","Akshita Bhagia","Dustin Schwenk","David Wadden","Alexander Wettig","Binyuan Hui","Tim Dettmers","Douwe Kiela","Ali Farhadi","Noah A. 
Smith","Pang Wei Koh","Amanpreet Singh","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2409.02060v2.pdf","comment":"63 pages (24 main), 36 figures, 17 tables"},{"id":"http://arxiv.org/abs/2407.10967v2","updated":"2025-03-03T01:19:23Z","published":"2024-07-15T17:59:23Z","title":"BECAUSE: Bilinear Causal Representation for Generalizable Offline\n Model-based Reinforcement Learning","summary":" Offline model-based reinforcement learning (MBRL) enhances data efficiency by\nutilizing pre-collected datasets to learn models and policies, especially in\nscenarios where exploration is costly or infeasible. Nevertheless, its\nperformance often suffers from the objective mismatch between model and policy\nlearning, resulting in inferior performance despite accurate model predictions.\nThis paper first identifies the primary source of this mismatch comes from the\nunderlying confounders present in offline data for MBRL. Subsequently, we\nintroduce \\textbf{B}ilin\\textbf{E}ar \\textbf{CAUS}al\nr\\textbf{E}presentation~(BECAUSE), an algorithm to capture causal\nrepresentation for both states and actions to reduce the influence of the\ndistribution shift, thus mitigating the objective mismatch problem.\nComprehensive evaluations on 18 tasks that vary in data quality and environment\ncontext demonstrate the superior performance of BECAUSE over existing offline\nRL algorithms. We show the generalizability and robustness of BECAUSE under\nfewer samples or larger numbers of confounders. 
Additionally, we offer\ntheoretical analysis of BECAUSE to prove its error bound and sample efficiency\nwhen integrating causal representation into offline MBRL.\n","authors":["Haohong Lin","Wenhao Ding","Jian Chen","Laixi Shi","Jiacheng Zhu","Bo Li","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.10967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01417v2","updated":"2025-03-03T00:41:36Z","published":"2024-10-02T10:58:54Z","title":"The Labyrinth of Links: Navigating the Associative Maze of Multi-modal\n LLMs","summary":" Multi-modal Large Language Models (MLLMs) have exhibited impressive\ncapability. However, recently many deficiencies of MLLMs have been found\ncompared to human intelligence, $\\textit{e.g.}$, hallucination. To drive the\nMLLMs study, the community dedicated efforts to building larger benchmarks with\ncomplex tasks. In this paper, we propose benchmarking an essential but usually\noverlooked intelligence: $\\textbf{association}$, a human's basic capability to\nlink observation and prior practice memory. To comprehensively investigate\nMLLM's performance on the association, we formulate the association task and\ndevise a standard benchmark based on adjective and verb semantic concepts.\nInstead of costly data annotation and curation, we propose a convenient\n$\\textbf{annotation-free}$ construction method transforming the general dataset\nfor our association tasks. Simultaneously, we devise a rigorous data refinement\nprocess to eliminate confusion in the raw dataset. Building on this database,\nwe establish three levels of association tasks: single-step, synchronous, and\nasynchronous associations. Moreover, we conduct a comprehensive investigation\ninto the MLLMs' zero-shot association capabilities, addressing multiple\ndimensions, including three distinct memory strategies, both open-source and\nclosed-source MLLMs, cutting-edge Mixture-of-Experts (MoE) models, and the\ninvolvement of human experts. 
Our systematic investigation shows that current\nopen-source MLLMs consistently exhibit poor capability in our association\ntasks, even the currently state-of-the-art GPT-4V(vision) also has a\nsignificant gap compared to humans. We believe our benchmark would pave the way\nfor future MLLM studies. $\\textit{Our data and code are available at:}$\nhttps://mvig-rhos.com/llm_inception.\n","authors":["Hong Li","Nanxi Li","Yuanjie Chen","Jianbin Zhu","Qinlu Guo","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2410.01417v2.pdf","comment":"Accepted by ICLR 2025. Project page:\n https://mvig-rhos.com/llm_inception"},{"id":"http://arxiv.org/abs/2405.02318v2","updated":"2025-03-03T00:38:48Z","published":"2024-04-18T00:20:48Z","title":"NL2FOL: Translating Natural Language to First-Order Logic for Logical\n Fallacy Detection","summary":" Translating natural language into formal language such as First-Order Logic\n(FOL) is a foundational challenge in NLP with wide-ranging applications in\nautomated reasoning, misinformation tracking, and knowledge validation. In this\npaper, we introduce Natural Language to First-Order Logic (NL2FOL), a framework\nto autoformalize natural language to FOL step by step using Large Language\nModels (LLMs). Our approach addresses key challenges in this translation\nprocess, including the integration of implicit background knowledge. By\nleveraging structured representations generated by NL2FOL, we use\nSatisfiability Modulo Theory (SMT) solvers to reason about the logical validity\nof natural language statements. We present logical fallacy detection as a case\nstudy to evaluate the efficacy of NL2FOL. Being neurosymbolic, our approach\nalso provides interpretable insights into the reasoning process and\ndemonstrates robustness without requiring model fine-tuning or labeled training\ndata. Our framework achieves strong performance on multiple datasets. 
On the\nLOGIC dataset, NL2FOL achieves an F1-score of 78%, while generalizing\neffectively to the LOGICCLIMATE dataset with an F1-score of 80%.\n","authors":["Abhinav Lalwani","Tasha Kim","Lovish Chopra","Christopher Hahn","Zhijing Jin","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2405.02318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.15770v2","updated":"2025-03-03T00:24:08Z","published":"2025-02-16T08:52:45Z","title":"Performance Review on LLM for solving leetcode problems","summary":" This paper presents a comprehensive performance evaluation of Large Language\nModels (LLMs) in solving programming challenges from Leetcode, a widely used\nplatform for algorithm practice and technical interviews. We began by crawling\nthe Leetcode website to collect a diverse set of problems encompassing various\ndifficulty levels and topics. Using this dataset, we generated solutions with\nmultiple LLMs, including GPT-4 and GPT-3.5-turbo (ChatGPT-turbo). The generated\nsolutions were systematically evaluated for correctness and efficiency. We\nemployed the pass@k metric to assess the success rates within a given number of\nattempts and analyzed the runtime performance of the solutions. 
Our results\nhighlight the strengths and limitations of current LLMs [10] in code generation\nand problem-solving tasks, providing insights into their potential applications\nand areas for improvement in automated programming assistance.\n","authors":["Lun Wang","Chuanqi Shi","Shaoshui Du","Yiyi Tao","Yixian Shen","Hang Zheng","Yanxin Shen","Xinyu Qiu"],"pdf_url":"https://arxiv.org/pdf/2502.15770v2.pdf","comment":null}],"Genomics":[{"id":"http://arxiv.org/abs/2409.02143v2","updated":"2025-03-03T12:08:50Z","published":"2024-09-02T22:04:08Z","title":"MLOmics: Benchmark for Machine Learning on Cancer Multi-Omics Data","summary":" Framing the investigation of diverse cancers as a machine learning problem\nhas recently shown significant potential in multi-omics analysis and cancer\nresearch. Empowering these successful machine learning models are the\nhigh-quality training datasets with sufficient data volume and adequate\npreprocessing. However, while there exist several public data portals including\nThe Cancer Genome Atlas (TCGA) multi-omics initiative or open-bases such as the\nLinkedOmics, these databases are not off-the-shelf for existing machine\nlearning models. In this paper we propose MLOmics, an open cancer multi-omics\nbenchmark aiming at serving better the development and evaluation of\nbioinformatics and machine learning models. MLOmics contains 8,314 patient\nsamples covering all 32 cancer types with four omics types, stratified\nfeatures, and extensive baselines. 
Complementary support for downstream\nanalysis and bio-knowledge linking are also included to support\ninterdisciplinary analysis.\n","authors":["Ziwei Yang","Rikuto Kotoge","Xihao Piao","Zheng Chen","Lingwei Zhu","Peng Gao","Yasuko Matsubara","Yasushi Sakurai","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2409.02143v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.13838v2","updated":"2025-03-03T01:47:00Z","published":"2023-09-25T02:50:22Z","title":"Penalized Principal Component Analysis Using Smoothing","summary":" Principal components computed via PCA (principal component analysis) are\ntraditionally used to reduce dimensionality in genomic data or to correct for\npopulation stratification. In this paper, we explore the penalized eigenvalue\nproblem (PEP) which reformulates the computation of the first eigenvector as an\noptimization problem and adds an $L_1$ penalty constraint to enforce sparseness\nof the solution. The contribution of our article is threefold. First, we extend\nPEP by applying smoothing to the original LASSO-type $L_1$ penalty. This allows\none to compute analytical gradients which enable faster and more efficient\nminimization of the objective function associated with the optimization\nproblem. Second, we demonstrate how higher order eigenvectors can be calculated\nwith PEP using established results from singular value decomposition (SVD).\nThird, we present four experimental studies to demonstrate the usefulness of\nthe smoothed penalized eigenvectors. Using data from the 1000 Genomes Project\ndataset, we empirically demonstrate that our proposed smoothed PEP allows one\nto increase numerical stability and obtain meaningful eigenvectors. 
We also\nemploy the penalized eigenvector approach in two additional real data\napplications (computation of a polygenic risk score and clustering),\ndemonstrating that exchanging the penalized eigenvectors for their smoothed\ncounterparts can increase prediction accuracy in polygenic risk scores and\nenhance discernibility of clusterings. Moreover, we compare our proposed\nsmoothed PEP to seven state-of-the-art algorithms for sparse PCA and evaluate\nthe accuracy of the obtained eigenvectors, their support recovery, and their\nruntime.\n","authors":["Rebecca M. Hurwitz","Georg Hahn"],"pdf_url":"https://arxiv.org/pdf/2309.13838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05998v3","updated":"2025-03-03T21:31:23Z","published":"2024-05-09T09:34:51Z","title":"Whole Genome Transformer for Gene Interaction Effects in Microbiome\n Habitat Specificity","summary":" Leveraging the vast genetic diversity within microbiomes offers unparalleled\ninsights into complex phenotypes, yet the task of accurately predicting and\nunderstanding such traits from genomic data remains challenging. We propose a\nframework taking advantage of existing large models for gene vectorization to\npredict habitat specificity from entire microbial genome sequences. Based on\nour model, we develop attribution techniques to elucidate gene interaction\neffects that drive microbial adaptation to diverse environments. We train and\nvalidate our approach on a large dataset of high quality microbiome genomes\nfrom different habitats. We not only demonstrate solid predictive performance,\nbut also how sequence-level information of entire genomes allows us to identify\ngene associations underlying complex phenotypes. 
Our attribution recovers known\nimportant interaction networks and proposes new candidates for experimental\nfollow up.\n","authors":["Zhufeng Li","Sandeep S Cranganore","Nicholas Youngblut","Niki Kilbertus"],"pdf_url":"https://arxiv.org/pdf/2405.05998v3.pdf","comment":"published at AAAI 2025"},{"id":"http://arxiv.org/abs/2503.01994v1","updated":"2025-03-03T19:11:45Z","published":"2025-03-03T19:11:45Z","title":"Fungal Genetic Variants in Oceanic Environments","summary":" Comparing specific types of organisms as they are found across environmental\nconditions has helped inform how genes and gene products of these organisms\nrelate to phenotypes and adaptation. In this study, we examine\nmetatranscriptomic data as found for oceanic fungi across different oceanic\nsampling sites. A specific set of three genes was chosen for evaluation based\non conserved orthology, known association with core physiological processes in\nfungi, and level of abundance within oceanic metatranscriptomic data. We report\nupon a potential association of genetic variance with environmental conditions\nof iron, salt and phosphate in oceanic waters based on heatmap visualization\nand PERMANOVA analysis.\n","authors":["Sade A. Davenport","Scott H. Harrison"],"pdf_url":"https://arxiv.org/pdf/2503.01994v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.11734v2","updated":"2025-03-03T14:24:06Z","published":"2024-07-16T14:05:03Z","title":"Multi-Modal and Multi-Attribute Generation of Single Cells with CFGen","summary":" Generative modeling of single-cell RNA-seq data is crucial for tasks like\ntrajectory inference, batch effect removal, and simulation of realistic\ncellular data. However, recent deep generative models simulating synthetic\nsingle cells from noise operate on pre-processed continuous gene expression\napproximations, overlooking the discrete nature of single-cell data, which\nlimits their effectiveness and hinders the incorporation of robust noise\nmodels. 
Additionally, aspects like controllable multi-modal and multi-label\ngeneration of cellular data remain underexplored. This work introduces CellFlow\nfor Generation (CFGen), a flow-based conditional generative model that\npreserves the inherent discreteness of single-cell data. CFGen generates\nwhole-genome multi-modal single-cell data reliably, improving the recovery of\ncrucial biological data characteristics while tackling relevant generative\ntasks such as rare cell type augmentation and batch correction. We also\nintroduce a novel framework for compositional data generation using Flow\nMatching. By showcasing CFGen on a diverse set of biological datasets and\nsettings, we provide evidence of its value to the fields of computational\nbiology and deep generative models.\n","authors":["Alessandro Palma","Till Richter","Hanyi Zhang","Manuel Lubetzki","Alexander Tong","Andrea Dittadi","Fabian Theis"],"pdf_url":"https://arxiv.org/pdf/2407.11734v2.pdf","comment":"41 pages, 22 figures"},{"id":"http://arxiv.org/abs/2503.01459v1","updated":"2025-03-03T12:17:19Z","published":"2025-03-03T12:17:19Z","title":"Primer C-VAE: An interpretable deep learning primer design method to\n detect emerging virus variants","summary":" Motivation: PCR is more economical and quicker than Next Generation\nSequencing for detecting target organisms, with primer design being a critical\nstep. In epidemiology with rapidly mutating viruses, designing effective\nprimers is challenging. Traditional methods require substantial manual\nintervention and struggle to ensure effective primer design across different\nstrains. For organisms with large, similar genomes like Escherichia coli and\nShigella flexneri, differentiating between species is also difficult but\ncrucial.\n Results: We developed Primer C-VAE, a model based on a Variational\nAuto-Encoder framework with Convolutional Neural Networks to identify variants\nand generate specific primers. 
Using SARS-CoV-2, our model classified variants\n(alpha, beta, gamma, delta, omicron) with 98% accuracy and generated\nvariant-specific primers. These primers appeared with >95% frequency in target\nvariants and <5% in others, showing good performance in in-silico PCR tests.\nFor Alpha, Delta, and Omicron, our primer pairs produced fragments <200 bp,\nsuitable for qPCR detection. The model also generated effective primers for\norganisms with longer gene sequences like E. coli and S. flexneri.\n Conclusion: Primer C-VAE is an interpretable deep learning approach for\ndeveloping specific primer pairs for target organisms. This flexible,\nsemi-automated and reliable tool works regardless of sequence completeness and\nlength, allowing for qPCR applications and can be applied to organisms with\nlarge and highly similar genomes.\n","authors":["Hanyu Wang","Emmanuel K. Tsinda","Anthony J. Dunn","Francis Chikweto","Alain B. Zemkoho"],"pdf_url":"https://arxiv.org/pdf/2503.01459v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2502.19210v2","updated":"2025-03-03T15:32:09Z","published":"2025-02-26T15:13:08Z","title":"Langevin Multiplicative Weights Update with Applications in Polynomial\n Portfolio Management","summary":" We consider nonconvex optimization problem over simplex, and more generally,\na product of simplices. We provide an algorithm, Langevin Multiplicative\nWeights Update (LMWU) for solving global optimization problems by adding a\nnoise scaling with the non-Euclidean geometry in the simplex. Non-convex\noptimization has been extensively studied by machine learning community due to\nits application in various scenarios such as neural network approximation and\nfinding Nash equilibrium. Despite recent progresses on provable guarantee of\nescaping and avoiding saddle point (convergence to local minima) and global\nconvergence of Langevin gradient based method without constraints, the global\noptimization with constraints is less studied. 
We show that LMWU algorithm is\nprovably convergent to interior global minima with a non-asymptotic convergence\nanalysis. We verify the efficiency of the proposed algorithm in real data set\nfrom polynomial portfolio management, where optimization of a highly non-linear\nobjective function plays a crucial role.\n","authors":["Yi Feng","Xiao Wang","Tian Xie"],"pdf_url":"https://arxiv.org/pdf/2502.19210v2.pdf","comment":"Accepted for AAAI-2025"},{"id":"http://arxiv.org/abs/2502.12215v2","updated":"2025-03-03T15:29:43Z","published":"2025-02-17T07:21:11Z","title":"Revisiting the Test-Time Scaling of o1-like Models: Do they Truly\n Possess Test-Time Scaling Capabilities?","summary":" The advent of test-time scaling in large language models (LLMs), exemplified\nby OpenAI's o1 series, has advanced reasoning capabilities by scaling\ncomputational resource allocation during inference. While successors like QwQ,\nDeepseek-R1 (R1) and LIMO replicate these advancements, whether these models\ntruly possess test-time scaling capabilities remains underexplored. This study\nfound that longer CoTs of these o1-like models do not consistently enhance\naccuracy; in fact, correct solutions are often shorter than incorrect ones for\nthe same questions. Further investigation shows this phenomenon is closely\nrelated to models' self-revision capabilities - longer CoTs contain more\nself-revisions, which often lead to performance degradation. We then compare\nsequential and parallel scaling strategies on QwQ, R1 and LIMO, finding that\nparallel scaling achieves better coverage and scalability. 
Based on these\ninsights, we propose Shortest Majority Vote, a method that combines parallel\nscaling strategies with CoT length characteristics, significantly improving\nmodels' test-time scalability compared to conventional majority voting\napproaches.\n","authors":["Zhiyuan Zeng","Qinyuan Cheng","Zhangyue Yin","Yunhua Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2502.12215v2.pdf","comment":"Add the github link"},{"id":"http://arxiv.org/abs/2501.10945v2","updated":"2025-03-03T15:09:31Z","published":"2025-01-19T04:56:55Z","title":"Gradient-Based Multi-Objective Deep Learning: Algorithms, Theories,\n Applications, and Beyond","summary":" Multi-objective optimization (MOO) in deep learning aims to simultaneously\noptimize multiple conflicting objectives, a challenge frequently encountered in\nareas like multi-task learning and multi-criteria learning. Recent advancements\nin gradient-based MOO methods have enabled the discovery of diverse types of\nsolutions, ranging from a single balanced solution to finite or even infinite\nPareto sets, tailored to user needs. These developments have broad applications\nacross domains such as reinforcement learning, computer vision, recommendation\nsystems, and large language models. This survey provides the first\ncomprehensive review of gradient-based MOO in deep learning, covering\nalgorithms, theories, and practical applications. By unifying various\napproaches and identifying critical challenges, it serves as a foundational\nresource for driving innovation in this evolving field. A comprehensive list of\nMOO algorithms in deep learning is available at\nhttps://github.com/Baijiong-Lin/Awesome-Multi-Objective-Deep-Learning.\n","authors":["Weiyu Chen","Xiaoyuan Zhang","Baijiong Lin","Xi Lin","Han Zhao","Qingfu Zhang","James T. 
Kwok"],"pdf_url":"https://arxiv.org/pdf/2501.10945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.10784v2","updated":"2025-03-03T15:02:37Z","published":"2025-02-15T12:28:51Z","title":"Preconditioned Inexact Stochastic ADMM for Deep Model","summary":" The recent advancement of foundation models (FMs) has brought about a\nparadigm shift, revolutionizing various sectors worldwide. The popular\noptimizers used to train these models are stochastic gradient descent-based\nalgorithms, which face inherent limitations, such as slow convergence and\nstringent assumptions for convergence. In particular, data heterogeneity\narising from distributed settings poses significant challenges to their\ntheoretical and numerical performance. This paper develops an algorithm, PISA\n({P}reconditioned {I}nexact {S}tochastic {A}lternating Direction Method of\nMultipliers), which enables scalable parallel computing and supports various\nsecond-moment schemes. Grounded in rigorous theoretical guarantees, the\nalgorithm converges under the sole assumption of Lipschitz continuity of the\ngradient, thereby removing the need for other conditions commonly imposed by\nstochastic methods. This capability enables PISA to tackle the challenge of\ndata heterogeneity effectively. 
Comprehensive experimental evaluations for\ntraining or fine-tuning diverse FMs, including vision models, large language\nmodels, reinforcement learning models, generative adversarial networks, and\nrecurrent neural networks, demonstrate its superior numerical performance\ncompared to various state-of-the-art optimizers.\n","authors":["Shenglong Zhou","Ouya Wang","Ziyan Luo","Yongxu Zhu","Geoffrey Ye Li"],"pdf_url":"https://arxiv.org/pdf/2502.10784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23208v2","updated":"2025-03-03T14:29:16Z","published":"2024-10-30T16:59:41Z","title":"Kinetix: Investigating the Training of General Agents through Open-Ended\n Physics-Based Control Tasks","summary":" While large models trained with self-supervised learning on offline datasets\nhave shown remarkable capabilities in text and image domains, achieving the\nsame generalisation for agents that act in sequential decision problems remains\nan open challenge. In this work, we take a step towards this goal by\nprocedurally generating tens of millions of 2D physics-based tasks and using\nthese to train a general reinforcement learning (RL) agent for physical\ncontrol. To this end, we introduce Kinetix: an open-ended space of\nphysics-based RL environments that can represent tasks ranging from robotic\nlocomotion and grasping to video games and classic RL environments, all within\na unified framework. Kinetix makes use of our novel hardware-accelerated\nphysics engine Jax2D that allows us to cheaply simulate billions of environment\nsteps during training. Our trained agent exhibits strong physical reasoning\ncapabilities in 2D space, being able to zero-shot solve unseen human-designed\nenvironments. Furthermore, fine-tuning this general agent on tasks of interest\nshows significantly stronger performance than training an RL agent *tabula\nrasa*. This includes solving some environments that standard RL training\ncompletely fails at. 
We believe this demonstrates the feasibility of large\nscale, mixed-quality pre-training for online RL and we hope that Kinetix will\nserve as a useful framework to investigate this further.\n","authors":["Michael Matthews","Michael Beukman","Chris Lu","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2410.23208v2.pdf","comment":"ICLR 2025 Oral. The first two authors contributed equally. Project\n page located at: https://kinetix-env.github.io/"},{"id":"http://arxiv.org/abs/2410.15474v2","updated":"2025-03-03T14:08:48Z","published":"2024-10-20T19:12:14Z","title":"Optimizing Backward Policies in GFlowNets via Trajectory Likelihood\n Maximization","summary":" Generative Flow Networks (GFlowNets) are a family of generative models that\nlearn to sample objects with probabilities proportional to a given reward\nfunction. The key concept behind GFlowNets is the use of two stochastic\npolicies: a forward policy, which incrementally constructs compositional\nobjects, and a backward policy, which sequentially deconstructs them. Recent\nresults show a close relationship between GFlowNet training and\nentropy-regularized reinforcement learning (RL) problems with a particular\nreward design. However, this connection applies only in the setting of a fixed\nbackward policy, which might be a significant limitation. As a remedy to this\nproblem, we introduce a simple backward policy optimization algorithm that\ninvolves direct maximization of the value function in an entropy-regularized\nMarkov Decision Process (MDP) over intermediate rewards. 
We provide an\nextensive experimental evaluation of the proposed approach across various\nbenchmarks in combination with both RL and GFlowNet algorithms and demonstrate\nits faster convergence and mode discovery in complex environments.\n","authors":["Timofei Gritsaev","Nikita Morozov","Sergey Samsonov","Daniil Tiapkin"],"pdf_url":"https://arxiv.org/pdf/2410.15474v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2501.07596v2","updated":"2025-03-03T13:27:01Z","published":"2025-01-10T01:42:43Z","title":"Optimize Incompatible Parameters through Compatibility-aware Knowledge\n Integration","summary":" Deep neural networks have become foundational to advancements in multiple\ndomains, including recommendation systems, natural language processing, and so\non. Despite their successes, these models often contain incompatible parameters\nthat can be underutilized or detrimental to model performance, particularly\nwhen faced with specific, varying data distributions. Existing research excels\nin removing such parameters or merging the outputs of multiple different\npretrained models. However, the former focuses on efficiency rather than\nperformance, while the latter requires several times more computing and storage\nresources to support inference. In this paper, we set the goal to explicitly\nimprove these incompatible parameters by leveraging the complementary strengths\nof different models, thereby directly enhancing the models without any\nadditional parameters. Specifically, we propose Compatibility-aware Knowledge\nIntegration (CKI), which consists of Parameter Compatibility Assessment and\nParameter Splicing, which are used to evaluate the knowledge content of\nmultiple models and integrate the knowledge into one model, respectively. 
The\nintegrated model can be used directly for inference or for further fine-tuning.\nWe conduct extensive experiments on various datasets for recommendation and\nlanguage tasks, and the results show that Compatibility-aware Knowledge\nIntegration can effectively optimize incompatible parameters under multiple\ntasks and settings to break through the training limit of the original model\nwithout increasing the inference cost.\n","authors":["Zheqi Lv","Keming Ye","Zishu Wei","Qi Tian","Shengyu Zhang","Wenqiao Zhang","Wenjie Wang","Kun Kuang","Tat-Seng Chua","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2501.07596v2.pdf","comment":"Published on AAAI'25(Oral): The Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2411.17711v2","updated":"2025-03-03T13:19:42Z","published":"2024-11-17T17:32:58Z","title":"AnyECG: Foundational Models for Multitask Cardiac Analysis in Real-World\n Settings","summary":" Electrocardiogram (ECG), a non-invasive and affordable tool for cardiac\nmonitoring, is highly sensitive in detecting acute heart attacks. However, due\nto the lengthy nature of ECG recordings, numerous machine learning methods have\nbeen developed for automated heart disease detection to reduce human workload.\nDespite these efforts, performance remains suboptimal. A key obstacle is the\ninherent complexity of ECG data, which includes heterogeneity (e.g., varying\nsampling rates), high levels of noise, demographic-related pattern shifts, and\nintricate rhythm-event associations. To overcome these challenges, this paper\nintroduces AnyECG, a foundational model designed to extract robust\nrepresentations from any real-world ECG data. Specifically, a tailored ECG\nTokenizer encodes each fixed-duration ECG fragment into a token and, guided by\nproxy tasks, converts noisy, continuous ECG features into discrete, compact,\nand clinically meaningful local rhythm codes. 
These codes encapsulate basic\nmorphological, frequency, and demographic information (e.g., sex), effectively\nmitigating signal noise. We further pre-train the AnyECG to learn rhythmic\npattern associations across ECG tokens, enabling the capture of cardiac event\nsemantics. By being jointly pre-trained on diverse ECG data sources, AnyECG is\ncapable of generalizing across a wide range of downstream tasks where ECG\nsignals are recorded from various devices and scenarios. The experimental\nresults show that AnyECG achieves an average performance improvement of 6%\nacross four critical tasks-anomaly detection, arrhythmia classification,\ncorrupted lead generation, and ultra-long ECG recognition. AnyECG learns common\nECG rhythm from data and significantly outperforms state-of-the-art methods in\neach of these tasks.\n","authors":["Yue Wang","Xu Cao","Yaojun Hu","Haochao Ying","Hongxia Xu","Ruijia Wu","James Matthew Rehg","Jimeng Sun","Jian Wu","Jintai Chen"],"pdf_url":"https://arxiv.org/pdf/2411.17711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05106v2","updated":"2025-03-03T13:18:55Z","published":"2024-10-07T15:02:48Z","title":"Nonasymptotic Analysis of Stochastic Gradient Descent with the\n Richardson-Romberg Extrapolation","summary":" We address the problem of solving strongly convex and smooth minimization\nproblems using stochastic gradient descent (SGD) algorithm with a constant step\nsize. Previous works suggested to combine the Polyak-Ruppert averaging\nprocedure with the Richardson-Romberg extrapolation to reduce the asymptotic\nbias of SGD at the expense of a mild increase of the variance. We significantly\nextend previous results by providing an expansion of the mean-squared error of\nthe resulting estimator with respect to the number of iterations $n$. 
We show\nthat the root mean-squared error can be decomposed into the sum of two terms: a\nleading one of order $\\mathcal{O}(n^{-1/2})$ with explicit dependence on a\nminimax-optimal asymptotic covariance matrix, and a second-order term of order\n$\\mathcal{O}(n^{-3/4})$, where the power $3/4$ is best known. We also extend\nthis result to the higher-order moment bounds. Our analysis relies on the\nproperties of the SGD iterates viewed as a time-homogeneous Markov chain. In\nparticular, we establish that this chain is geometrically ergodic with respect\nto a suitably defined weighted Wasserstein semimetric.\n","authors":["Marina Sheshukova","Denis Belomestny","Alain Durmus","Eric Moulines","Alexey Naumov","Sergey Samsonov"],"pdf_url":"https://arxiv.org/pdf/2410.05106v2.pdf","comment":"ICLR-2025, camera-ready version"},{"id":"http://arxiv.org/abs/2410.07076v4","updated":"2025-03-03T13:17:24Z","published":"2024-10-09T17:19:58Z","title":"MOOSE-Chem: Large Language Models for Rediscovering Unseen Chemistry\n Scientific Hypotheses","summary":" Scientific discovery contributes largely to human society's prosperity, and\nrecent progress shows that LLMs could potentially catalyze this process.\nHowever, it is still unclear whether LLMs can discover novel and valid\nhypotheses in chemistry. In this work, we investigate this central research\nquestion: Can LLMs automatically discover novel and valid chemistry research\nhypotheses given only a chemistry research background (consisting of a research\nquestion and/or a background survey), without limitation on the domain of the\nresearch question? After extensive discussions with chemistry experts, we\npropose an assumption that a majority of chemistry hypotheses can be resulted\nfrom a research background and several inspirations. With this key insight, we\nbreak the central question into three smaller fundamental questions. 
In brief,\nthey are: (1) given a background question, whether LLMs can retrieve good\ninspirations; (2) with background and inspirations, whether LLMs can lead to\nhypothesis; and (3) whether LLMs can identify good hypotheses to rank them\nhigher. To investigate these questions, we construct a benchmark consisting of\n51 chemistry papers published in Nature, Science, or a similar level in 2024\n(all papers are only available online since 2024). Every paper is divided by\nchemistry PhD students into three components: background, inspirations, and\nhypothesis. The goal is to rediscover the hypothesis, given only the background\nand a large randomly selected chemistry literature corpus consisting the ground\ntruth inspiration papers, with LLMs trained with data up to 2023. We also\ndevelop an LLM-based multi-agent framework that leverages the assumption,\nconsisting of three stages reflecting the three smaller questions. The proposed\nmethod can rediscover many hypotheses with very high similarity with the ground\ntruth ones, covering the main innovations.\n","authors":["Zonglin Yang","Wanhao Liu","Ben Gao","Tong Xie","Yuqiang Li","Wanli Ouyang","Soujanya Poria","Erik Cambria","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.07076v4.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2412.16577v2","updated":"2025-03-03T12:21:35Z","published":"2024-12-21T10:52:56Z","title":"A Meta-Learning Approach to Bayesian Causal Discovery","summary":" Discovering a unique causal structure is difficult due to both inherent\nidentifiability issues, and the consequences of finite data. As such,\nuncertainty over causal structures, such as those obtained from a Bayesian\nposterior, are often necessary for downstream tasks. Finding an accurate\napproximation to this posterior is challenging, due to the large number of\npossible causal graphs, as well as the difficulty in the subproblem of finding\nposteriors over the functional relationships of the causal edges. 
Recent works\nhave used meta-learning to view the problem of estimating the maximum\na-posteriori causal graph as supervised learning. Yet, these methods are\nlimited when estimating the full posterior as they fail to encode key\nproperties of the posterior, such as correlation between edges and permutation\nequivariance with respect to nodes. Further, these methods also cannot reliably\nsample from the posterior over causal structures. To address these limitations,\nwe propose a Bayesian meta learning model that allows for sampling causal\nstructures from the posterior and encodes these key properties. We compare our\nmeta-Bayesian causal discovery against existing Bayesian causal discovery\nmethods, demonstrating the advantages of directly learning a posterior over\ncausal structure.\n","authors":["Anish Dhir","Matthew Ashman","James Requeima","Mark van der Wilk"],"pdf_url":"https://arxiv.org/pdf/2412.16577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08190v2","updated":"2025-03-03T12:18:29Z","published":"2024-10-10T17:57:29Z","title":"Poison-splat: Computation Cost Attack on 3D Gaussian Splatting","summary":" 3D Gaussian splatting (3DGS), known for its groundbreaking performance and\nefficiency, has become a dominant 3D representation and brought progress to\nmany 3D vision tasks. However, in this work, we reveal a significant security\nvulnerability that has been largely overlooked in 3DGS: the computation cost of\ntraining 3DGS could be maliciously tampered by poisoning the input data. By\ndeveloping an attack named Poison-splat, we reveal a novel attack surface where\nthe adversary can poison the input images to drastically increase the\ncomputation memory and time needed for 3DGS training, pushing the algorithm\ntowards its worst computation complexity. In extreme cases, the attack can even\nconsume all allocable memory, leading to a Denial-of-Service (DoS) that\ndisrupts servers, resulting in practical damages to real-world 3DGS service\nvendors. 
Such a computation cost attack is achieved by addressing a bi-level\noptimization problem through three tailored strategies: attack objective\napproximation, proxy model rendering, and optional constrained optimization.\nThese strategies not only ensure the effectiveness of our attack but also make\nit difficult to defend with simple defensive measures. We hope the revelation\nof this novel attack surface can spark attention to this crucial yet overlooked\nvulnerability of 3DGS systems. Our code is available at\nhttps://github.com/jiahaolu97/poison-splat .\n","authors":["Jiahao Lu","Yifan Zhang","Qiuhong Shen","Xinchao Wang","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2410.08190v2.pdf","comment":"Accepted by ICLR 2025 as a spotlight paper"},{"id":"http://arxiv.org/abs/2410.00722v2","updated":"2025-03-03T12:18:16Z","published":"2024-10-01T14:13:05Z","title":"On the Geometry and Optimization of Polynomial Convolutional Networks","summary":" We study convolutional neural networks with monomial activation functions.\nSpecifically, we prove that their parameterization map is regular and is an\nisomorphism almost everywhere, up to rescaling the filters. By leveraging on\ntools from algebraic geometry, we explore the geometric properties of the image\nin function space of this map - typically referred to as neuromanifold. 
In\nparticular, we compute the dimension and the degree of the neuromanifold, which\nmeasure the expressivity of the model, and describe its singularities.\nMoreover, for a generic large dataset, we derive an explicit formula that\nquantifies the number of critical points arising in the optimization of a\nregression loss.\n","authors":["Vahid Shahverdi","Giovanni Luca Marchetti","Kathlén Kohn"],"pdf_url":"https://arxiv.org/pdf/2410.00722v2.pdf","comment":"Accepted at AISTATS 2025"},{"id":"http://arxiv.org/abs/2410.12343v3","updated":"2025-03-03T12:15:38Z","published":"2024-10-16T08:04:57Z","title":"Federated Temporal Graph Clustering","summary":" Temporal graph clustering is a complex task that involves discovering\nmeaningful structures in dynamic graphs where relationships and entities change\nover time. Existing methods typically require centralized data collection,\nwhich poses significant privacy and communication challenges. In this work, we\nintroduce a novel Federated Temporal Graph Clustering (FTGC) framework that\nenables decentralized training of graph neural networks (GNNs) across multiple\nclients, ensuring data privacy throughout the process. Our approach\nincorporates a temporal aggregation mechanism to effectively capture the\nevolution of graph structures over time and a federated optimization strategy\nto collaboratively learn high-quality clustering representations. 
By preserving\ndata privacy and reducing communication overhead, our framework achieves\ncompetitive performance on temporal graph datasets, making it a promising\nsolution for privacy-sensitive, real-world applications involving dynamic data.\n","authors":["Zihao Zhou","Yang Liu","Xianghong Xu","Qian Li"],"pdf_url":"https://arxiv.org/pdf/2410.12343v3.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.02143v2","updated":"2025-03-03T12:08:50Z","published":"2024-09-02T22:04:08Z","title":"MLOmics: Benchmark for Machine Learning on Cancer Multi-Omics Data","summary":" Framing the investigation of diverse cancers as a machine learning problem\nhas recently shown significant potential in multi-omics analysis and cancer\nresearch. Empowering these successful machine learning models are the\nhigh-quality training datasets with sufficient data volume and adequate\npreprocessing. However, while there exist several public data portals including\nThe Cancer Genome Atlas (TCGA) multi-omics initiative or open-bases such as the\nLinkedOmics, these databases are not off-the-shelf for existing machine\nlearning models. In this paper we propose MLOmics, an open cancer multi-omics\nbenchmark aiming at serving better the development and evaluation of\nbioinformatics and machine learning models. MLOmics contains 8,314 patient\nsamples covering all 32 cancer types with four omics types, stratified\nfeatures, and extensive baselines. 
Complementary support for downstream\nanalysis and bio-knowledge linking are also included to support\ninterdisciplinary analysis.\n","authors":["Ziwei Yang","Rikuto Kotoge","Xihao Piao","Zheng Chen","Lingwei Zhu","Peng Gao","Yasuko Matsubara","Yasushi Sakurai","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2409.02143v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2502.17941v2","updated":"2025-03-03T12:00:57Z","published":"2025-02-25T08:03:04Z","title":"Optimal Brain Apoptosis","summary":" The increasing complexity and parameter count of Convolutional Neural\nNetworks (CNNs) and Transformers pose challenges in terms of computational\nefficiency and resource demands. Pruning has been identified as an effective\nstrategy to address these challenges by removing redundant elements such as\nneurons, channels, or connections, thereby enhancing computational efficiency\nwithout heavily compromising performance. This paper builds on the foundational\nwork of Optimal Brain Damage (OBD) by advancing the methodology of parameter\nimportance estimation using the Hessian matrix. Unlike previous approaches that\nrely on approximations, we introduce Optimal Brain Apoptosis (OBA), a novel\npruning method that calculates the Hessian-vector product value directly for\neach parameter. By decomposing the Hessian matrix across network layers and\nidentifying conditions under which inter-layer Hessian submatrices are\nnon-zero, we propose a highly efficient technique for computing the\nsecond-order Taylor expansion of parameters. This approach allows for a more\nprecise pruning process, particularly in the context of CNNs and Transformers,\nas validated in our experiments including VGG19, ResNet32, ResNet50, and\nViT-B/16 on CIFAR10, CIFAR100 and Imagenet datasets. 
Our code is available at\nhttps://github.com/NEU-REAL/OBA.\n","authors":["Mingyuan Sun","Zheng Fang","Jiaxu Wang","Junjie Jiang","Delei Kong","Chenming Hu","Yuetong Fang","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2502.17941v2.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2407.15589v5","updated":"2025-03-03T11:48:03Z","published":"2024-07-22T12:26:08Z","title":"Exploring the Effectiveness of Object-Centric Representations in Visual\n Question Answering: Comparative Insights with Foundation Models","summary":" Object-centric (OC) representations, which model visual scenes as\ncompositions of discrete objects, have the potential to be used in various\ndownstream tasks to achieve systematic compositional generalization and\nfacilitate reasoning. However, these claims have yet to be thoroughly validated\nempirically. Recently, foundation models have demonstrated unparalleled\ncapabilities across diverse domains, from language to computer vision,\npositioning them as a potential cornerstone of future research for a wide range\nof computational tasks. In this paper, we conduct an extensive empirical study\non representation learning for downstream Visual Question Answering (VQA),\nwhich requires an accurate compositional understanding of the scene. We\nthoroughly investigate the benefits and trade-offs of OC models and alternative\napproaches including large pre-trained foundation models on both synthetic and\nreal-world data, ultimately identifying a promising path to leverage the\nstrengths of both paradigms. 
The extensiveness of our study, encompassing over\n600 downstream VQA models and 15 different types of upstream representations,\nalso provides several additional insights that we believe will be of interest\nto the community at large.\n","authors":["Amir Mohammad Karimi Mamaghan","Samuele Papa","Karl Henrik Johansson","Stefan Bauer","Andrea Dittadi"],"pdf_url":"https://arxiv.org/pdf/2407.15589v5.pdf","comment":"Published at ICLR 2025"},{"id":"http://arxiv.org/abs/2405.16195v3","updated":"2025-03-03T11:39:53Z","published":"2024-05-25T11:57:43Z","title":"Adaptive $Q$-Network: On-the-fly Target Selection for Deep Reinforcement\n Learning","summary":" Deep Reinforcement Learning (RL) is well known for being highly sensitive to\nhyperparameters, requiring practitioners substantial efforts to optimize them\nfor the problem at hand. This also limits the applicability of RL in real-world\nscenarios. In recent years, the field of automated Reinforcement Learning\n(AutoRL) has grown in popularity by trying to address this issue. However,\nthese approaches typically hinge on additional samples to select\nwell-performing hyperparameters, hindering sample-efficiency and practicality.\nFurthermore, most AutoRL methods are heavily based on already existing AutoML\nmethods, which were originally developed neglecting the additional challenges\ninherent to RL due to its non-stationarities. In this work, we propose a new\napproach for AutoRL, called Adaptive $Q$-Network (AdaQN), that is tailored to\nRL to take into account the non-stationarity of the optimization procedure\nwithout requiring additional samples. AdaQN learns several $Q$-functions, each\none trained with different hyperparameters, which are updated online using the\n$Q$-function with the smallest approximation error as a shared target. 
Our\nselection scheme simultaneously handles different hyperparameters while coping\nwith the non-stationarity induced by the RL optimization procedure and being\northogonal to any critic-based RL algorithm. We demonstrate that AdaQN is\ntheoretically sound and empirically validate it in MuJoCo control problems and\nAtari $2600$ games, showing benefits in sample-efficiency, overall performance,\nrobustness to stochasticity and training stability.\n","authors":["Théo Vincent","Fabian Wahren","Jan Peters","Boris Belousov","Carlo D'Eramo"],"pdf_url":"https://arxiv.org/pdf/2405.16195v3.pdf","comment":"Accepted at ICLR https://iclr.cc/virtual/2025/poster/28508"},{"id":"http://arxiv.org/abs/2410.11502v2","updated":"2025-03-03T11:38:11Z","published":"2024-10-15T11:15:03Z","title":"Offline Model-Based Optimization by Learning to Rank","summary":" Offline model-based optimization (MBO) aims to identify a design that\nmaximizes a black-box function using only a fixed, pre-collected dataset of\ndesigns and their corresponding scores. A common approach in offline MBO is to\ntrain a regression-based surrogate model by minimizing mean squared error (MSE)\nand then find the best design within this surrogate model by different\noptimizers (e.g., gradient ascent). However, a critical challenge is the risk\nof out-of-distribution errors, i.e., the surrogate model may typically\noverestimate the scores and mislead the optimizers into suboptimal regions.\nPrior works have attempted to address this issue in various ways, such as using\nregularization techniques and ensemble learning to enhance the robustness of\nthe model, but it still remains. In this paper, we argue that regression models\ntrained with MSE are not well-aligned with the primary goal of offline MBO,\nwhich is to select promising designs rather than to predict their scores\nprecisely. 
Notably, if a surrogate model can maintain the order of candidate\ndesigns based on their relative score relationships, it can produce the best\ndesigns even without precise predictions. To validate it, we conduct\nexperiments to compare the relationship between the quality of the final\ndesigns and MSE, finding that the correlation is really very weak. In contrast,\na metric that measures order-maintaining quality shows a significantly stronger\ncorrelation. Based on this observation, we propose learning a ranking-based\nmodel that leverages learning to rank techniques to prioritize promising\ndesigns based on their relative scores. We show that the generalization error\non ranking loss can be well bounded. Empirical results across diverse tasks\ndemonstrate the superior performance of our proposed ranking-based models than\ntwenty existing methods.\n","authors":["Rong-Xi Tan","Ke Xue","Shen-Huan Lyu","Haopu Shang","Yao Wang","Yaoyuan Wang","Sheng Fu","Chao Qian"],"pdf_url":"https://arxiv.org/pdf/2410.11502v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2407.06057v2","updated":"2025-03-03T11:08:15Z","published":"2024-07-08T15:59:44Z","title":"Variational Best-of-N Alignment","summary":" Best-of-N (BoN) is a popular and effective algorithm for aligning language\nmodels to human preferences. The algorithm works as follows: at inference time,\nN samples are drawn from the language model, and the sample with the highest\nreward, as judged by a reward model, is returned as the output. Despite its\neffectiveness, BoN is computationally expensive; it reduces sampling throughput\nby a factor of N. To make BoN more efficient at inference time, one strategy is\nto fine-tune the language model to mimic what BoN does during inference. To\nachieve this, we derive the distribution induced by the BoN algorithm. We then\npropose to fine-tune the language model to minimize backward KL divergence to\nthe BoN distribution. 
Our approach is analogous to mean-field variational\ninference and, thus, we term it variational BoN (vBoN). To the extent this\nfine-tuning is successful and we end up with a good approximation, we have\nreduced the inference cost by a factor of N. Our experiments on controlled\ngeneration and summarization tasks show that BoN is the most effective\nalignment method, and our variational approximation to BoN achieves the closest\nperformance to BoN and surpasses models fine-tuned using the standard\nKL-constrained RL objective. In the controlled generation task, vBoN appears\nmore frequently on the Pareto frontier of reward and KL divergence compared to\nother alignment methods. In the summarization task, vBoN achieves high reward\nvalues across various sampling temperatures.\n","authors":["Afra Amini","Tim Vieira","Elliott Ash","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2407.06057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05841v2","updated":"2025-03-03T11:00:24Z","published":"2024-11-06T15:06:42Z","title":"FLEXtime: Filterbank learning to explain time series","summary":" State-of-the-art methods for explaining predictions from time series involve\nlearning an instance-wise saliency mask for each time step; however, many types\nof time series are difficult to interpret in the time domain, due to the\ninherently complex nature of the data. Instead, we propose to view time series\nexplainability as saliency maps over interpretable parts, leaning on\nestablished signal processing methodology on signal decomposition.\nSpecifically, we propose a new method called FLEXtime that uses a bank of\nbandpass filters to split the time series into frequency bands. Then, we learn\nthe combination of these bands that optimally explains the model's prediction.\nOur extensive evaluation shows that, on average, FLEXtime outperforms\nstate-of-the-art explainability methods across a range of datasets. 
FLEXtime\nfills an important gap in the current time series explainability methodology\nand is a valuable tool for a wide range of time series such as EEG and audio.\nCode will be made available at https://github.com/theabrusch/FLEXtime.\n","authors":["Thea Brüsch","Kristoffer K. Wickstrøm","Mikkel N. Schmidt","Robert Jenssen","Tommy S. Alstrøm"],"pdf_url":"https://arxiv.org/pdf/2411.05841v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.18936v4","updated":"2025-03-03T11:00:24Z","published":"2025-01-31T07:41:06Z","title":"Adaptive Prompt: Unlocking the Power of Visual Prompt Tuning","summary":" Visual Prompt Tuning (VPT) has recently emerged as a powerful method for\nadapting pre-trained vision models to downstream tasks. By introducing\nlearnable prompt tokens as task-specific instructions, VPT effectively guides\npre-trained transformer models with minimal overhead. Despite its empirical\nsuccess, a comprehensive theoretical understanding of VPT remains an active\narea of research. Building on recent insights into the connection between\nmixture of experts and prompt-based approaches, we identify a key limitation in\nVPT: the restricted functional expressiveness in prompt formulation. To address\nthis limitation, we propose Visual Adaptive Prompt Tuning (VAPT), a new\ngeneration of prompts that redefines prompts as adaptive functions of the\ninput. Our theoretical analysis shows that this simple yet intuitive approach\nachieves optimal sample efficiency. Empirical results on VTAB-1K and FGVC\nfurther demonstrate VAPT's effectiveness, with performance gains of 7.34% and\n1.04% over fully fine-tuning baselines, respectively. Notably, VAPT also\nsurpasses VPT by a substantial margin while using fewer parameters. 
These\nresults highlight both the effectiveness and efficiency of our method and pave\nthe way for future research to explore the potential of adaptive prompts.\n","authors":["Minh Le","Anh Nguyen","Huy Nguyen","Chau Nguyen","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2501.18936v4.pdf","comment":"57 pages, 10 figures, 18 tables"},{"id":"http://arxiv.org/abs/2405.20579v3","updated":"2025-03-03T10:57:41Z","published":"2024-05-31T02:17:51Z","title":"HOPE: A Reinforcement Learning-based Hybrid Policy Path Planner for\n Diverse Parking Scenarios","summary":" Automated parking stands as a highly anticipated application of autonomous\ndriving technology. However, existing path planning methodologies fall short of\naddressing this need due to their incapability to handle the diverse and\ncomplex parking scenarios in reality. While non-learning methods provide\nreliable planning results, they are vulnerable to intricate occasions, whereas\nlearning-based ones are good at exploration but unstable in converging to\nfeasible solutions. To leverage the strengths of both approaches, we introduce\nHybrid pOlicy Path plannEr (HOPE). This novel solution integrates a\nreinforcement learning agent with Reeds-Shepp curves, enabling effective\nplanning across diverse scenarios. HOPE guides the exploration of the\nreinforcement learning agent by applying an action mask mechanism and employs a\ntransformer to integrate the perceived environmental information with the mask.\nTo facilitate the training and evaluation of the proposed planner, we propose a\ncriterion for categorizing the difficulty level of parking scenarios based on\nspace and obstacle distribution. Experimental results demonstrate that our\napproach outperforms typical rule-based algorithms and traditional\nreinforcement learning methods, showing higher planning success rates and\ngeneralization across various scenarios. We also conduct real-world experiments\nto verify the practicability of HOPE. 
The code for our solution is openly\navailable on https://github.com/jiamiya/HOPE.\n","authors":["Mingyang Jiang","Yueyuan Li","Songan Zhang","Siyuan Chen","Chunxiang Wang","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2405.20579v3.pdf","comment":"Accepted by T-ITS. 11 pages, 5 tables, 6 figures, 2 page appendix"},{"id":"http://arxiv.org/abs/2410.02423v2","updated":"2025-03-03T10:44:06Z","published":"2024-10-03T12:13:56Z","title":"PnP-Flow: Plug-and-Play Image Restoration with Flow Matching","summary":" In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm\nfor solving imaging inverse problems. PnP methods leverage the strength of\npre-trained denoisers, often deep neural networks, by integrating them in\noptimization schemes. While they achieve state-of-the-art performance on\nvarious inverse problems in imaging, PnP approaches face inherent limitations\non more generative tasks like inpainting. On the other hand, generative models\nsuch as Flow Matching pushed the boundary in image sampling yet lack a clear\nmethod for efficient use in image restoration. We propose to combine the PnP\nframework with Flow Matching (FM) by defining a time-dependent denoiser using a\npre-trained FM model. Our algorithm alternates between gradient descent steps\non the data-fidelity term, reprojections onto the learned FM path, and\ndenoising. Notably, our method is computationally efficient and\nmemory-friendly, as it avoids backpropagation through ODEs and trace\ncomputations. 
We evaluate its performance on denoising, super-resolution,\ndeblurring, and inpainting tasks, demonstrating superior results compared to\nexisting PnP algorithms and Flow Matching based state-of-the-art methods.\n","authors":["Ségolène Martin","Anne Gagneux","Paul Hagemann","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2410.02423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11542v2","updated":"2025-03-03T10:39:41Z","published":"2024-12-16T08:22:23Z","title":"Meta Curvature-Aware Minimization for Domain Generalization","summary":" Domain generalization (DG) aims to enhance the ability of models trained on\nsource domains to generalize effectively to unseen domains. Recently,\nSharpness-Aware Minimization (SAM) has shown promise in this area by reducing\nthe sharpness of the loss landscape to obtain more generalized models. However,\nSAM and its variants sometimes fail to guide the model toward a flat minimum,\nand their training processes exhibit limitations, hindering further\nimprovements in model generalization. In this paper, we first propose an\nimproved model training process aimed at encouraging the model to converge to a\nflat minima. To achieve this, we design a curvature metric that has a minimal\neffect when the model is far from convergence but becomes increasingly\ninfluential in indicating the curvature of the minima as the model approaches a\nlocal minimum. Then we derive a novel algorithm from this metric, called Meta\nCurvature-Aware Minimization (MeCAM), to minimize the curvature around the\nlocal minima. Specifically, the optimization objective of MeCAM simultaneously\nminimizes the regular training loss, the surrogate gap of SAM, and the\nsurrogate gap of meta-learning. 
We provide theoretical analysis on MeCAM's\ngeneralization error and convergence rate, and demonstrate its superiority over\nexisting DG methods through extensive experiments on five benchmark DG\ndatasets, including PACS, VLCS, OfficeHome, TerraIncognita, and DomainNet. Code\nwill be available on GitHub.\n","authors":["Ziyang Chen","Yiwen Ye","Feilong Tang","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2412.11542v2.pdf","comment":"22 pages, 5 figures, 17 tables"},{"id":"http://arxiv.org/abs/2502.08005v2","updated":"2025-03-03T10:38:34Z","published":"2025-02-11T23:02:14Z","title":"Towards Training One-Step Diffusion Models Without Distillation","summary":" Recent advances in one-step generative models typically follow a two-stage\nprocess: first training a teacher diffusion model and then distilling it into a\none-step student model. This distillation process traditionally relies on both\nthe teacher model's score function to compute the distillation loss and its\nweights for student initialization. In this paper, we explore whether one-step\ngenerative models can be trained directly without this distillation process.\nFirst, we show that the teacher's score function is not essential and propose a\nfamily of distillation methods that achieve competitive results without relying\non score estimation. Next, we demonstrate that initialization from teacher\nweights is indispensable in successful training. Surprisingly, we find that\nthis benefit is not due to improved ``input-output\" mapping but rather the\nlearned feature representations, which dominate distillation quality. 
Our\nfindings provide a better understanding of the role of initialization in\none-step model training and its impact on distillation quality.\n","authors":["Mingtian Zhang","Jiajun He","Wenlin Chen","Zijing Ou","José Miguel Hernández-Lobato","Bernhard Schölkopf","David Barber"],"pdf_url":"https://arxiv.org/pdf/2502.08005v2.pdf","comment":"13 pages, Technical Report"},{"id":"http://arxiv.org/abs/2502.15425v3","updated":"2025-03-03T10:35:14Z","published":"2025-02-21T12:52:16Z","title":"TAG: A Decentralized Framework for Multi-Agent Hierarchical\n Reinforcement Learning","summary":" Hierarchical organization is fundamental to biological systems and human\nsocieties, yet artificial intelligence systems often rely on monolithic\narchitectures that limit adaptability and scalability. Current hierarchical\nreinforcement learning (HRL) approaches typically restrict hierarchies to two\nlevels or require centralized training, which limits their practical\napplicability. We introduce TAME Agent Framework (TAG), a framework for\nconstructing fully decentralized hierarchical multi-agent systems.TAG enables\nhierarchies of arbitrary depth through a novel LevelEnv concept, which\nabstracts each hierarchy level as the environment for the agents above it. This\napproach standardizes information flow between levels while preserving loose\ncoupling, allowing for seamless integration of diverse agent types. We\ndemonstrate the effectiveness of TAG by implementing hierarchical architectures\nthat combine different RL agents across multiple levels, achieving improved\nperformance over classical multi-agent RL baselines on standard benchmarks. 
Our\nresults show that decentralized hierarchical organization enhances both\nlearning speed and final performance, positioning TAG as a promising direction\nfor scalable multi-agent systems.\n","authors":["Giuseppe Paolo","Abdelhakim Benechehab","Hamza Cherkaoui","Albert Thomas","Balázs Kégl"],"pdf_url":"https://arxiv.org/pdf/2502.15425v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06916v2","updated":"2025-03-03T10:22:24Z","published":"2024-11-11T12:19:28Z","title":"Slowing Down Forgetting in Continual Learning","summary":" A common challenge in continual learning (CL) is catastrophic forgetting,\nwhere the performance on old tasks drops after new, additional tasks are\nlearned. In this paper, we propose a novel framework called ReCL to slow down\nforgetting in CL. Our framework exploits an implicit bias of gradient-based\nneural networks due to which these converge to margin maximization points. Such\nconvergence points allow us to reconstruct old data from previous tasks, which\nwe then combine with the current training data. Our framework is flexible and\ncan be applied on top of existing, state-of-the-art CL methods. We further\ndemonstrate the performance gain from our framework across a large series of\nexperiments, including two challenging CL scenarios (class incremental and\ndomain incremental learning), different datasets (MNIST, CIFAR10,\nTinyImagenet), and different network architectures. Across all experiments, we\nfind large performance gains through ReCL. 
To the best of our knowledge, our\nframework is the first to address catastrophic forgetting by leveraging models\nin CL as their own memory buffers.\n","authors":["Pascal Janetzky","Tobias Schlagenhauf","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2411.06916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21123v2","updated":"2025-03-03T10:00:03Z","published":"2025-02-28T14:57:33Z","title":"Causality Is Key to Understand and Balance Multiple Goals in Trustworthy\n ML and Foundation Models","summary":" Ensuring trustworthiness in machine learning (ML) systems is crucial as they\nbecome increasingly embedded in high-stakes domains. This paper advocates for\nintegrating causal methods into machine learning to navigate the trade-offs\namong key principles of trustworthy ML, including fairness, privacy,\nrobustness, accuracy, and explainability. While these objectives should ideally\nbe satisfied simultaneously, they are often addressed in isolation, leading to\nconflicts and suboptimal solutions. Drawing on existing applications of\ncausality in ML that successfully align goals such as fairness and accuracy or\nprivacy and robustness, this paper argues that a causal approach is essential\nfor balancing multiple competing objectives in both trustworthy ML and\nfoundation models. Beyond highlighting these trade-offs, we examine how\ncausality can be practically integrated into ML and foundation models, offering\nsolutions to enhance their reliability and interpretability. 
Finally, we\ndiscuss the challenges, limitations, and opportunities in adopting causal\nframeworks, paving the way for more accountable and ethically sound AI systems.\n","authors":["Ruta Binkyte","Ivaxi Sheth","Zhijing Jin","Mohammad Havaei","Bernhard Schölkopf","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2502.21123v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02392v2","updated":"2025-03-03T09:50:18Z","published":"2024-10-03T11:13:55Z","title":"MANTRA: The Manifold Triangulations Assemblage","summary":" The rising interest in leveraging higher-order interactions present in\ncomplex systems has led to a surge in more expressive models exploiting\nhigher-order structures in the data, especially in topological deep learning\n(TDL), which designs neural networks on higher-order domains such as simplicial\ncomplexes. However, progress in this field is hindered by the scarcity of\ndatasets for benchmarking these architectures. To address this gap, we\nintroduce MANTRA, the first large-scale, diverse, and intrinsically\nhigher-order dataset for benchmarking higher-order models, comprising over\n43,000 and 250,000 triangulations of surfaces and three-dimensional manifolds,\nrespectively. With MANTRA, we assess several graph- and simplicial\ncomplex-based models on three topological classification tasks. We demonstrate\nthat while simplicial complex-based neural networks generally outperform their\ngraph-based counterparts in capturing simple topological invariants, they also\nstruggle, suggesting a rethink of TDL. 
Thus, MANTRA serves as a benchmark for\nassessing and advancing topological methods, leading the way for more effective\nhigher-order models.\n","authors":["Rubén Ballester","Ernst Röell","Daniel Bīn Schmid","Mathieu Alain","Sergio Escalera","Carles Casacuberta","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2410.02392v2.pdf","comment":"Accepted at ICLR 2025 (https://openreview.net/forum?id=X6y5CC44HM)"},{"id":"http://arxiv.org/abs/2402.09154v2","updated":"2025-03-03T09:37:27Z","published":"2024-02-14T13:13:26Z","title":"Attacking Large Language Models with Projected Gradient Descent","summary":" Current LLM alignment methods are readily broken through specifically crafted\nadversarial prompts. While crafting adversarial prompts using discrete\noptimization is highly effective, such attacks typically use more than 100,000\nLLM calls. This high computational cost makes them unsuitable for, e.g.,\nquantitative analyses and adversarial training. To remedy this, we revisit\nProjected Gradient Descent (PGD) on the continuously relaxed input prompt.\nAlthough previous attempts with ordinary gradient-based attacks largely failed,\nwe show that carefully controlling the error introduced by the continuous\nrelaxation tremendously boosts their efficacy. Our PGD for LLMs is up to one\norder of magnitude faster than state-of-the-art discrete optimization to\nachieve the same devastating attack results.\n","authors":["Simon Geisler","Tom Wollschläger","M. H. I. Abdalla","Johannes Gasteiger","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2402.09154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23751v2","updated":"2025-03-03T09:30:42Z","published":"2024-10-31T09:11:56Z","title":"EXACFS -- A CIL Method to mitigate Catastrophic Forgetting","summary":" Deep neural networks (DNNS) excel at learning from static datasets but\nstruggle with continual learning, where data arrives sequentially. 
Catastrophic\nforgetting, the phenomenon of forgetting previously learned knowledge, is a\nprimary challenge. This paper introduces EXponentially Averaged Class-wise\nFeature Significance (EXACFS) to mitigate this issue in the class incremental\nlearning (CIL) setting. By estimating the significance of model features for\neach learned class using loss gradients, gradually aging the significance\nthrough the incremental tasks and preserving the significant features through a\ndistillation loss, EXACFS effectively balances remembering old knowledge\n(stability) and learning new knowledge (plasticity). Extensive experiments on\nCIFAR-100 and ImageNet-100 demonstrate EXACFS's superior performance in\npreserving stability while acquiring plasticity.\n","authors":["S Balasubramanian","M Sai Subramaniam","Sai Sriram Talasu","Yedu Krishna P","Manepalli Pranav Phanindra Sai","Ravi Mukkamala","Darshan Gera"],"pdf_url":"https://arxiv.org/pdf/2410.23751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00537v2","updated":"2025-03-03T09:26:05Z","published":"2024-11-30T17:05:12Z","title":"Exact Certification of (Graph) Neural Networks Against Label Poisoning","summary":" Machine learning models are highly vulnerable to label flipping, i.e., the\nadversarial modification (poisoning) of training labels to compromise\nperformance. Thus, deriving robustness certificates is important to guarantee\nthat test predictions remain unaffected and to understand worst-case robustness\nbehavior. However, for Graph Neural Networks (GNNs), the problem of certifying\nlabel flipping has so far been unsolved. 
We change this by introducing an exact\ncertification method, deriving both sample-wise and collective certificates.\nOur method leverages the Neural Tangent Kernel (NTK) to capture the training\ndynamics of wide networks enabling us to reformulate the bilevel optimization\nproblem representing label flipping into a Mixed-Integer Linear Program (MILP).\nWe apply our method to certify a broad range of GNN architectures in node\nclassification tasks. Thereby, concerning the worst-case robustness to label\nflipping: $(i)$ we establish hierarchies of GNNs on different benchmark graphs;\n$(ii)$ quantify the effect of architectural choices such as activations, depth\nand skip-connections; and surprisingly, $(iii)$ uncover a novel phenomenon of\nthe robustness plateauing for intermediate perturbation budgets across all\ninvestigated datasets and architectures. While we focus on GNNs, our\ncertificates are applicable to sufficiently wide NNs in general through their\nNTK. Thus, our work presents the first exact certificate to a poisoning attack\never derived for neural networks, which could be of independent interest. The\ncode is available at https://github.com/saper0/qpcert.\n","authors":["Mahalakshmi Sabanayagam","Lukas Gosch","Stephan Günnemann","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2412.00537v2.pdf","comment":"Published as a spotlight presentation at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.16890v2","updated":"2025-03-03T08:58:48Z","published":"2025-02-24T06:40:33Z","title":"ReFocus: Reinforcing Mid-Frequency and Key-Frequency Modeling for\n Multivariate Time Series Forecasting","summary":" Recent advancements have progressively incorporated frequency-based\ntechniques into deep learning models, leading to notable improvements in\naccuracy and efficiency for time series analysis tasks. 
However, the\nMid-Frequency Spectrum Gap in the real-world time series, where the energy is\nconcentrated at the low-frequency region while the middle-frequency band is\nnegligible, hinders the ability of existing deep learning models to extract the\ncrucial frequency information. Additionally, the shared Key-Frequency in\nmultivariate time series, where different time series share indistinguishable\nfrequency patterns, is rarely exploited by existing literature. This work\nintroduces a novel module, Adaptive Mid-Frequency Energy Optimizer, based on\nconvolution and residual learning, to emphasize the significance of\nmid-frequency bands. We also propose an Energy-based Key-Frequency Picking\nBlock to capture shared Key-Frequency, which achieves superior inter-series\nmodeling performance with fewer parameters. A novel Key-Frequency Enhanced\nTraining strategy is employed to further enhance Key-Frequency modeling, where\nspectral information from other channels is randomly introduced into each\nchannel. Our approach advanced multivariate time series forecasting on the\nchallenging Traffic, ECL, and Solar benchmarks, reducing MSE by 4%, 6%, and 5%\ncompared to the previous SOTA iTransformer. Code is available at this GitHub\nRepository: https://github.com/Levi-Ackman/ReFocus.\n","authors":["Guoqi Yu","Yaoming Li","Juncheng Wang","Xiaoyu Guo","Angelica I. Aviles-Rivero","Tong Yang","Shujun Wang"],"pdf_url":"https://arxiv.org/pdf/2502.16890v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2502.08679v3","updated":"2025-03-03T08:50:28Z","published":"2025-02-12T08:56:35Z","title":"Deep Learning-Driven Malware Classification with API Call Sequence\n Analysis and Concept Drift Handling","summary":" Malware classification in dynamic environments presents a significant\nchallenge due to concept drift, where the statistical properties of malware\ndata evolve over time, complicating detection efforts. 
To address this issue,\nwe propose a deep learning framework enhanced with a genetic algorithm to\nimprove malware classification accuracy and adaptability. Our approach\nincorporates mutation operations and fitness score evaluations within genetic\nalgorithms to continuously refine the deep learning model, ensuring robustness\nagainst evolving malware threats. Experimental results demonstrate that this\nhybrid method significantly enhances classification performance and\nadaptability, outperforming traditional static models. Our proposed approach\noffers a promising solution for real-time malware classification in\never-changing cybersecurity landscapes.\n","authors":["Bishwajit Prasad Gond","Durga Prasad Mohapatra"],"pdf_url":"https://arxiv.org/pdf/2502.08679v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03856v4","updated":"2025-03-03T08:48:38Z","published":"2024-07-04T11:42:36Z","title":"Q-Adapter: Customizing Pre-trained LLMs to New Preferences with\n Forgetting Mitigation","summary":" Large Language Models (LLMs), trained on a large amount of corpus, have\ndemonstrated remarkable abilities. However, it may not be sufficient to\ndirectly apply open-source LLMs like Llama to certain real-world scenarios,\nsince most of them are trained for \\emph{general} purposes. Thus, the demands\nfor customizing publicly available LLMs emerge, but are currently\nunder-studied. In this work, we consider customizing pre-trained LLMs with new\nhuman preferences. Specifically, the LLM should not only meet the new\npreference but also preserve its original capabilities after customization.\nDrawing inspiration from the observation that human preference can be expressed\nas a reward model, we propose to cast LLM customization as optimizing the sum\nof two reward functions, one of which (denoted as $r_1$) was used to pre-train\nthe LLM while the other (denoted as $r_2$) characterizes the new human\npreference. 
The obstacle here is that both reward functions are unknown, making\nthe application of modern reinforcement learning methods infeasible. Thanks to\nthe residual Q-learning framework, we can restore the customized LLM with the\npre-trained LLM and the \\emph{residual Q-function} without the reward function\n$r_1$. Moreover, we find that for a fixed pre-trained LLM, the reward function\n$r_2$ can be derived from the residual Q-function, enabling us to directly\nlearn the residual Q-function from the new human preference data upon the\nBradley-Terry model. We name our method Q-Adapter as it introduces an adapter\nmodule to approximate the residual Q-function for customizing the pre-trained\nLLM towards the new preference. Experiments based on the Llama-3.1 model on the\nDSP dataset and HH-RLHF dataset illustrate the superior effectiveness of\nQ-Adapter on both retaining existing knowledge and learning new preferences.\nCode is available at https://github.com/mansicer/Q-Adapter.\n","authors":["Yi-Chen Li","Fuxiang Zhang","Wenjie Qiu","Lei Yuan","Chengxing Jia","Zongzhang Zhang","Yang Yu","Bo An"],"pdf_url":"https://arxiv.org/pdf/2407.03856v4.pdf","comment":"Camera ready version of ICLR 2025"},{"id":"http://arxiv.org/abs/2410.07267v2","updated":"2025-03-03T08:45:31Z","published":"2024-10-09T02:44:53Z","title":"Scintillation pulse characterization with spectrum-inspired temporal\n neural networks: case studies on particle detector signals","summary":" Particle detectors based on scintillators are widely used in high-energy\nphysics and astroparticle physics experiments, nuclear medicine imaging,\nindustrial and environmental detection, etc. Precisely extracting scintillation\nsignal characteristics at the event level is important for these applications,\nnot only in respect of understanding the scintillator itself, but also kinds\nand physical property of incident particles. 
Recent researches demonstrate\ndata-driven neural networks surpass traditional statistical methods, especially\nwhen the analytical form of signals is hard to obtain, or noise is significant.\nHowever, most densely connected or convolution-based networks fail to fully\nexploit the spectral and temporal structure of scintillation signals, leaving\nlarge space for performance improvement. In this paper, we propose a network\narchitecture specially tailored for scintillation pulse characterization based\non previous works on time series analysis. The core insight is that, by\ndirectly applying Fast Fourier Transform on original signals and utilizing\ndifferent frequency components, the proposed network architecture can serve as\na lightweight and enhanced representation learning backbone. We prove our idea\nin two case studies: (a) simulation data generated with the setting of the LUX\ndark matter detector, and (b) experimental electrical signals with fast\nelectronics to emulate scintillation variations for the NICA/MPD calorimeter.\nThe proposed model achieves significantly better results than the reference\nmodel in literature and densely connected models, and demonstrates higher\ncost-efficiency than conventional machine learning methods.\n","authors":["Pengcheng Ai","Xiangming Sun","Zhi Deng","Xinchi Ran"],"pdf_url":"https://arxiv.org/pdf/2410.07267v2.pdf","comment":"29 pages, 14 figures"},{"id":"http://arxiv.org/abs/2502.11167v2","updated":"2025-03-03T08:26:12Z","published":"2025-02-16T15:38:19Z","title":"SURGE: On the Potential of Large Language Models as General-Purpose\n Surrogate Code Executors","summary":" Neural surrogate models have emerged as powerful and efficient tools in data\nmining. Meanwhile, large language models (LLMs) have demonstrated remarkable\ncapabilities in code-related tasks. We investigate a novel application: using\nLLMs as surrogate models for code execution prediction. 
Given LLMs' unique\nability to understand and process diverse programs, they present a promising\ndirection for building general-purpose surrogate models. To systematically\ninvestigate this capability, we introduce SURGE, a comprehensive benchmark with\n$1160$ problems covering $8$ key aspects: multi-language programming tasks,\ncompetition-level programming problems, repository-level code analysis,\nhigh-cost scientific computing, time-complexity-intensive algorithms, buggy\ncode analysis, programs dependent on specific compilers or execution\nenvironments, and formal mathematical proof verification. Through extensive\nempirical analysis of $21$ open-source and proprietary LLMs, we examine scaling\nlaws, data efficiency, and predictive accuracy. Our findings reveal important\ninsights about the feasibility of LLMs as efficient surrogates for\ncomputational processes, with implications for automated software testing,\nprogram analysis, and computational resource optimization in data mining\napplications. Code and dataset are released at\nhttps://github.com/Imbernoulli/SURGE.\n","authors":["Bohan Lyu","Siqiao Huang","Zichen Liang"],"pdf_url":"https://arxiv.org/pdf/2502.11167v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19316v2","updated":"2025-03-03T08:22:25Z","published":"2024-05-29T17:39:48Z","title":"Robust Preference Optimization through Reward Model Distillation","summary":" Language model (LM) post-training (or alignment) involves maximizing a reward\nfunction that is derived from preference annotations. Direct Preference\nOptimization (DPO) is a popular offline alignment method that trains a policy\ndirectly on preference data without the need to train a reward model or apply\nreinforcement learning. However, the empirical evidence suggests that DPO\ntypically assigns implicit rewards that overfit, and trend towards infinite\nmagnitude. 
This frequently leads to degenerate policies, sometimes causing even\nthe probabilities of the preferred generations to go to zero. In this work, we\nanalyze this phenomenon and use distillation to get a better proxy for the true\npreference distribution over generation pairs: we train the LM such that its\ninduced implicit reward, i.e., the scaled log-likelihood ratio of the model to\nthe reference model, matches an explicit reward model trained on the preference\ndata. Moreover, to account for uncertainty in the reward model we are\ndistilling from, we optimize against a family of reward models that, as a\nwhole, is likely to include at least one reasonable proxy for the preference\ndistribution. Our results show that distilling from such a family of reward\nmodels leads to improved robustness to distribution shift in preference\nannotations, while preserving the simple supervised nature of DPO.\n","authors":["Adam Fisch","Jacob Eisenstein","Vicky Zayats","Alekh Agarwal","Ahmad Beirami","Chirag Nagpal","Pete Shaw","Jonathan Berant"],"pdf_url":"https://arxiv.org/pdf/2405.19316v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07407v2","updated":"2025-03-03T08:05:53Z","published":"2024-12-10T10:58:47Z","title":"Towards Graph Foundation Models: A Study on the Generalization of\n Positional and Structural Encodings","summary":" Recent advances in integrating positional and structural encodings (PSEs)\ninto graph neural networks (GNNs) have significantly enhanced their performance\nacross various graph learning tasks. However, the general applicability of\nthese encodings and their potential to serve as foundational representations\nfor graphs remain uncertain. This paper investigates the fine-tuning\nefficiency, scalability with sample size, and generalization capability of\nlearnable PSEs across diverse graph datasets. 
Specifically, we evaluate their\npotential as universal pre-trained models that can be easily adapted to new\ntasks with minimal fine-tuning and limited data. Furthermore, we assess the\nexpressivity of the learned representations, particularly, when used to augment\ndownstream GNNs. We demonstrate through extensive benchmarking and empirical\nanalysis that PSEs generally enhance downstream models. However, some datasets\nmay require specific PSE-augmentations to achieve optimal performance.\nNevertheless, our findings highlight their significant potential to become\nintegral components of future graph foundation models. We provide new insights\ninto the strengths and limitations of PSEs, contributing to the broader\ndiscourse on foundation models in graph learning.\n","authors":["Billy Joe Franks","Moshe Eliasof","Semih Cantürk","Guy Wolf","Carola-Bibiane Schönlieb","Sophie Fellenz","Marius Kloft"],"pdf_url":"https://arxiv.org/pdf/2412.07407v2.pdf","comment":"Published at TMLR (https://openreview.net/forum?id=mSoDRZXsqj)"},{"id":"http://arxiv.org/abs/2410.02683v2","updated":"2025-03-03T07:20:54Z","published":"2024-10-03T17:08:52Z","title":"DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of\n Daily Life","summary":" As users increasingly seek guidance from LLMs for decision-making in daily\nlife, many of these decisions are not clear-cut and depend significantly on the\npersonal values and ethical standards of people. We present DailyDilemmas, a\ndataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma\npresents two possible actions, along with affected parties and relevant human\nvalues for each action. Based on these dilemmas, we gather a repository of\nhuman values covering diverse everyday topics, such as interpersonal\nrelationships, workplace, and environmental issues. With DailyDilemmas, we\nevaluate LLMs on these dilemmas to determine what action they will choose and\nthe values represented by these action choices. 
Then, we analyze values through\nthe lens of five theoretical frameworks inspired by sociology, psychology, and\nphilosophy, including the World Values Survey, Moral Foundations Theory,\nMaslow's Hierarchy of Needs, Aristotle's Virtues, and Plutchik's Wheel of\nEmotions. For instance, we find LLMs are most aligned with self-expression over\nsurvival in World Values Survey and care over loyalty in Moral Foundations\nTheory. Interestingly, we find substantial preference differences in models for\nsome core values. For example, for truthfulness, Mixtral-8x7B neglects it by\n9.7% while GPT-4-turbo selects it by 9.4%. We also study the recent guidance\nreleased by OpenAI (ModelSpec), and Anthropic (Constitutional AI) to understand\nhow their designated principles reflect their models' actual value\nprioritization when facing nuanced moral reasoning in daily-life settings.\nFinally, we find that end users cannot effectively steer such prioritization\nusing system prompts.\n","authors":["Yu Ying Chiu","Liwei Jiang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02683v2.pdf","comment":"Accepted into ICLR 2025 (spotlight)"},{"id":"http://arxiv.org/abs/2501.02497v2","updated":"2025-03-03T07:16:16Z","published":"2025-01-05T10:24:20Z","title":"Test-Time Compute: from System-1 Thinking to System-2 Thinking","summary":" The remarkable performance of the o1 model in complex reasoning demonstrates\nthat test-time compute scaling can further unlock the model's potential,\nenabling powerful System-2 thinking. However, there is still a lack of\ncomprehensive surveys for test-time compute scaling. We trace the concept of\ntest-time compute back to System-1 models. In System-1 models, test-time\ncompute addresses distribution shifts and improves robustness and\ngeneralization through parameter updating, input modification, representation\nediting, and output calibration. 
In System-2 models, it enhances the model's\nreasoning ability to solve complex problems through repeated sampling,\nself-correction, and tree search. We organize this survey according to the\ntrend of System-1 to System-2 thinking, highlighting the key role of test-time\ncompute in the transition from System-1 models to weak System-2 models, and\nthen to strong System-2 models. We also point out a few possible future\ndirections.\n","authors":["Yixin Ji","Juntao Li","Hai Ye","Kaixin Wu","Kai Yao","Jia Xu","Linjian Mo","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.02497v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2403.03636v3","updated":"2025-03-03T06:56:29Z","published":"2024-03-06T11:48:08Z","title":"SheetAgent: Towards A Generalist Agent for Spreadsheet Reasoning and\n Manipulation via Large Language Models","summary":" Spreadsheets are ubiquitous across the World Wide Web, playing a critical\nrole in enhancing work efficiency across various domains. Large language model\n(LLM) has been recently attempted for automatic spreadsheet manipulation but\nhas not yet been investigated in complicated and realistic tasks where\nreasoning challenges exist (e.g., long horizon manipulation with multi-step\nreasoning and ambiguous requirements). To bridge the gap with the real-world\nrequirements, we introduce SheetRM, a benchmark featuring long-horizon and\nmulti-category tasks with reasoning-dependent manipulation caused by real-life\nchallenges. To mitigate the above challenges, we further propose SheetAgent, a\nnovel autonomous agent that utilizes the power of LLMs. SheetAgent consists of\nthree collaborative modules: Planner, Informer, and Retriever, achieving both\nadvanced reasoning and accurate manipulation over spreadsheets without human\ninteraction through iterative task reasoning and reflection. 
Extensive\nexperiments demonstrate that SheetAgent delivers 20--40\\% pass rate\nimprovements on multiple benchmarks over baselines, achieving enhanced\nprecision in spreadsheet manipulation and demonstrating superior table\nreasoning abilities. More details and visualizations are available at the\nproject website: https://sheetagent.github.io/. The datasets and source code\nare available at https://anonymous.4open.science/r/SheetAgent.\n","authors":["Yibin Chen","Yifu Yuan","Zeyu Zhang","Yan Zheng","Jinyi Liu","Fei Ni","Jianye Hao","Hangyu Mao","Fuzheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.03636v3.pdf","comment":"Accepted by International World Wide Web Conference (WWW) 2025 (oral)"},{"id":"http://arxiv.org/abs/2407.04752v2","updated":"2025-03-03T06:46:33Z","published":"2024-07-05T08:37:17Z","title":"SpikeLLM: Scaling up Spiking Neural Network to Large Language Models via\n Saliency-based Spiking","summary":" Recent advancements in large language models (LLMs) with billions of\nparameters have improved performance in various applications, but their\ninference processes demand significant energy and computational resources. In\ncontrast, the human brain, with approximately 86 billion neurons, is much more\nenergy-efficient than LLMs with similar parameters. Inspired by this, we\nredesign 7$\\sim$70 billion parameter LLMs using bio-plausible spiking\nmechanisms, emulating the efficient behavior of the human brain. We propose the\nfirst spiking large language model, SpikeLLM. Coupled with the proposed model,\ntwo essential approaches are proposed to improve spike training efficiency:\nGeneralized Integrate-and-Fire (GIF) neurons to compress spike length from $T$\nto $\\frac{T}{L} \\log_2 L$ bits, and an Optimal Brain Spiking framework to\ndivide outlier channels and allocate different $T$ for GIF neurons, which\nfurther compresses spike length to approximate $log_2T$ bits. 
The necessity of\nspike-driven LLM is proved by comparison with quantized LLMs with similar\noperations. In the OmniQuant pipeline, SpikeLLM reduces 11.01% WikiText2\nperplexity and improves 2.55% accuracy of common scene reasoning on a LLAMA-7B\nW4A4 model. In the GPTQ pipeline, SpikeLLM achieves direct additive in linear\nlayers, significantly exceeding PB-LLMs.\n","authors":["Xingrun Xing","Boyan Gao","Zheng Zhang","David A. Clifton","Shitao Xiao","Li Du","Guoqi Li","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.12949v2","updated":"2025-03-03T06:42:17Z","published":"2025-02-18T16:00:10Z","title":"Efficient Learning Under Density Shift in Incremental Settings Using\n Cramér-Rao-Based Regularization","summary":" The continuous surge in data volume and velocity is often dealt with using\ndata orchestration and distributed processing approaches, abstracting away the\nmachine learning challenges that exist at the algorithmic level. With growing\ninterest in automating the learning loop, training with data that arrive in a\nsequence rather than in the classical in-memory training data form will face a\nmachine learning challenge because of evolving feature distributions across\nbatches of training data biasing the cross-validation step\n(\\cite{sugiyama2012machine}). This work takes a distributed density estimation\nangle to the problem where data are temporally distributed. It processes data\nin batches and allows a neural network to treat a batch as training data. The\nmethod accumulates knowledge about the data density via posterior probability\nabsorption using the Fisher Information Matrix, which contains information\nabout the local optimization gradients for the batch. This is then used as a\nregularizer for the loss in the following batch, and therefore the density\nestimate for the entire dataset constructively gets more robust to the non-iid\ndistribution shift. 
This needs the presence of a pair of batches in memory at a\ntime, so the space cost is not a function of the size of the complete,\ndistributed dataset. We proposed a novel regularization-based approach\nCovariate Shift Correction $C^{2}A$ that leverages Fisher information and\nKullback-Leibler divergence to adapt to both natural and sequential covariate\nshift caused by dataset fragmentation. $C^{2}A$ achieves $19\\%$ accuracy at\nmaximum against state-of-the-art methods.\n","authors":["Behraj Khan","Behroz Mirza","Nouman Durrani","Tahir Syed"],"pdf_url":"https://arxiv.org/pdf/2502.12949v2.pdf","comment":"It is the older version of our this paper arXiv:2502.15756. So this\n is the duplicate older version mistakenly uploaded. There are mistakes in the\n method part of this paper"},{"id":"http://arxiv.org/abs/2412.15598v2","updated":"2025-03-03T06:39:17Z","published":"2024-12-20T06:42:58Z","title":"Long-Term EEG Partitioning for Seizure Onset Detection","summary":" Deep learning models have recently shown great success in classifying\nepileptic patients using EEG recordings. Unfortunately, classification-based\nmethods lack a sound mechanism to detect the onset of seizure events. In this\nwork, we propose a two-stage framework, SODor, that explicitly models seizure\nonset through a novel task formulation of subsequence clustering. Given an EEG\nsequence, the framework first learns a set of second-level embeddings with\nlabel supervision. It then employs model-based clustering to explicitly capture\nlong-term temporal dependencies in EEG sequences and identify meaningful\nsubsequences. Epochs within a subsequence share a common cluster assignment\n(normal or seizure), with cluster or state transitions representing successful\nonset detections. 
Extensive experiments on three datasets demonstrate that our\nmethod can correct misclassifications, achieving 5\\%-11\\% classification\nimprovements over other baselines and accurately detecting seizure onsets.\n","authors":["Zheng Chen","Yasuko Matsubara","Yasushi Sakurai","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2412.15598v2.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2309.15531v3","updated":"2025-03-03T06:37:01Z","published":"2023-09-27T09:48:31Z","title":"Rethinking Channel Dimensions to Isolate Outliers for Low-bit Weight\n Quantization of Large Language Models","summary":" Large Language Models (LLMs) have recently demonstrated remarkable success\nacross various tasks. However, efficiently serving LLMs has been a challenge\ndue to the large memory bottleneck, specifically in small batch inference\nsettings (e.g. mobile devices). Weight-only quantization can be a promising\napproach, but sub-4 bit quantization remains a challenge due to large-magnitude\nactivation outliers. To mitigate the undesirable outlier effect, we first\npropose per-IC quantization, a simple yet effective method that creates\nquantization groups within each input channel (IC) rather than the conventional\nper-output-channel (per-OC). Our method is motivated by the observation that\nactivation outliers affect the input dimension of the weight matrix, so\nsimilarly grouping the weights in the IC direction can isolate outliers within\na group. We also find that activation outliers do not dictate quantization\ndifficulty, and inherent weight sensitivities also exist. With per-IC\nquantization as a new outlier-friendly scheme, we propose Adaptive Dimensions\n(AdaDim), a versatile quantization framework that can adapt to various weight\nsensitivity patterns. 
We demonstrate the effectiveness of AdaDim by augmenting\nprior methods such as Round-To-Nearest and GPTQ, showing significant\nimprovements across various language modeling benchmarks for both base (up to\n+4.7% on MMLU) and instruction-tuned (up to +10% on HumanEval) LLMs. Code is\navailable at https://github.com/johnheo/adadim-llm\n","authors":["Jung Hwan Heo","Jeonghoon Kim","Beomseok Kwon","Byeongwook Kim","Se Jung Kwon","Dongsoo Lee"],"pdf_url":"https://arxiv.org/pdf/2309.15531v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2412.19160v2","updated":"2025-03-03T06:34:25Z","published":"2024-12-26T10:40:15Z","title":"Cross-Spectral Vision Transformer for Biometric Authentication using\n Forehead Subcutaneous Vein Pattern and Periocular Pattern","summary":" Traditional biometric systems have encountered significant setbacks due to\nvarious unavoidable factors, for example, face recognition-based biometrics\nfails due to the wearing of face masks and fingerprints create hygiene\nconcerns. This paper proposes a novel lightweight cross-spectral vision\ntransformer (CS-ViT) for biometric authentication using forehead subcutaneous\nvein patterns and periocular patterns, offering a promising alternative to\ntraditional methods, capable of performing well even with the face masks and\nwithout any physical touch. The proposed framework comprises a cross-spectral\ndual-channel architecture designed to handle two distinct biometric traits and\nto capture inter-dependencies in terms of relative spectral patterns. Each\nchannel consists of a Phase-Only Correlation Cross-Spectral Attention (POC-CSA)\nthat captures their individual as well as correlated patterns. The computation\nof cross-spectral attention using POC extracts the phase correlation in the\nspatial features. Therefore, it is robust against the resolution/intensity\nvariations and illumination of the input images, assuming both biometric traits\nare from the same person. 
The lightweight model is suitable for edge device\ndeployment. The performance of the proposed algorithm was rigorously evaluated\nusing the Forehead Subcutaneous Vein Pattern and Periocular Biometric Pattern\n(FSVP-PBP) database. The results demonstrated the superiority of the algorithm\nover state-of-the-art methods, achieving a remarkable classification accuracy\nof 98.8% with the combined vein and periocular patterns.\n","authors":["Arun K. Sharma","Shubhobrata Bhattacharya","Motahar Reza","Bishakh Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2412.19160v2.pdf","comment":"Submitted to IEEE TPAMI"},{"id":"http://arxiv.org/abs/2410.01746v2","updated":"2025-03-03T06:17:54Z","published":"2024-10-02T17:01:01Z","title":"Leray-Schauder Mappings for Operator Learning","summary":" We present an algorithm for learning operators between Banach spaces, based\non the use of Leray-Schauder mappings to learn a finite-dimensional\napproximation of compact subspaces. We show that the resulting method is a\nuniversal approximator of (possibly nonlinear) operators. We demonstrate the\nefficiency of the approach on two benchmark datasets showing it achieves\nresults comparable to state of the art models.\n","authors":["Emanuele Zappala"],"pdf_url":"https://arxiv.org/pdf/2410.01746v2.pdf","comment":"13 pages, 2 figures, 1 table. Comments are welcome! v2: Theoretical\n analysis expanded, several explanations regarding the experiments have been\n added for improved clarity"},{"id":"http://arxiv.org/abs/2310.01405v4","updated":"2025-03-03T06:14:14Z","published":"2023-10-02T17:59:07Z","title":"Representation Engineering: A Top-Down Approach to AI Transparency","summary":" In this paper, we identify and characterize the emerging area of\nrepresentation engineering (RepE), an approach to enhancing the transparency of\nAI systems that draws on insights from cognitive neuroscience. 
RepE places\npopulation-level representations, rather than neurons or circuits, at the\ncenter of analysis, equipping us with novel methods for monitoring and\nmanipulating high-level cognitive phenomena in deep neural networks (DNNs). We\nprovide baselines and an initial analysis of RepE techniques, showing that they\noffer simple yet effective solutions for improving our understanding and\ncontrol of large language models. We showcase how these methods can provide\ntraction on a wide range of safety-relevant problems, including honesty,\nharmlessness, power-seeking, and more, demonstrating the promise of top-down\ntransparency research. We hope that this work catalyzes further exploration of\nRepE and fosters advancements in the transparency and safety of AI systems.\n","authors":["Andy Zou","Long Phan","Sarah Chen","James Campbell","Phillip Guo","Richard Ren","Alexander Pan","Xuwang Yin","Mantas Mazeika","Ann-Kathrin Dombrowski","Shashwat Goel","Nathaniel Li","Michael J. Byun","Zifan Wang","Alex Mallen","Steven Basart","Sanmi Koyejo","Dawn Song","Matt Fredrikson","J. Zico Kolter","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2310.01405v4.pdf","comment":"Code is available at\n https://github.com/andyzoujm/representation-engineering"},{"id":"http://arxiv.org/abs/2411.02886v2","updated":"2025-03-03T05:49:41Z","published":"2024-11-05T07:56:24Z","title":"TokenSelect: Efficient Long-Context Inference and Length Extrapolation\n for LLMs via Dynamic Token-Level KV Cache Selection","summary":" The rapid advancement of Large Language Models (LLMs) has driven growing\ndemand for processing extended context sequences in contemporary applications.\nHowever, this progress faces two major challenges: performance degradation due\nto sequence lengths out-of-distribution, and excessively long inference times\ncaused by the quadratic computational complexity of attention. These issues\nhinder the application of LLMs in long-context scenarios. 
In this paper, we\npropose Dynamic Token-Level KV Cache Selection (TokenSelect), a training-free\nmethod for efficient and accurate long-context inference. TokenSelect builds\nupon the observation of non-contiguous attention sparsity, using Query-Key dot\nproducts to measure per-head KV Cache criticality at token-level. By per-head\nsoft voting mechanism, TokenSelect selectively involves a few critical KV cache\ntokens in attention calculation without sacrificing accuracy. To further\naccelerate TokenSelect, we design the Selection Cache based on observations of\nconsecutive Query similarity and implemented efficient dot product kernel,\nsignificantly reducing the overhead. A comprehensive evaluation of TokenSelect\ndemonstrates up to 23.84x speedup in attention computation and up to 2.28x\nacceleration in end-to-end latency, while providing superior performance\ncompared to state-of-the-art long-context inference methods.\n","authors":["Wei Wu","Zhuoshi Pan","Chao Wang","Liyi Chen","Yunchu Bai","Tianfu Wang","Kun Fu","Zheng Wang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2411.02886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04495v4","updated":"2025-03-03T05:38:10Z","published":"2024-07-05T13:35:14Z","title":"Speed-accuracy relations for the diffusion models: Wisdom from\n nonequilibrium thermodynamics and optimal transport","summary":" We discuss a connection between a generative model, called the diffusion\nmodel, and nonequilibrium thermodynamics for the Fokker-Planck equation, called\nstochastic thermodynamics. Based on the techniques of stochastic\nthermodynamics, we derive the speed-accuracy relations for the diffusion\nmodels, which are inequalities that relate the accuracy of data generation to\nthe entropy production rate, which can be interpreted as the speed of the\ndiffusion dynamics in the absence of the non-conservative force. 
From a\nstochastic thermodynamic perspective, our results provide a quantitative\ninsight into how best to generate data in diffusion models. The optimal\nlearning protocol is introduced by the geodesic of space of the 2-Wasserstein\ndistance in optimal transport theory. We numerically illustrate the validity of\nthe speed-accuracy relations for the diffusion models with different noise\nschedules and the different data. We numerically discuss our results for the\noptimal and suboptimal learning protocols. We also show the inaccurate data\ngeneration due to the non-conservative force, and the applicability of our\nresults to data generation from the real-world image datasets.\n","authors":["Kotaro Ikeda","Tomoya Uda","Daisuke Okanohara","Sosuke Ito"],"pdf_url":"https://arxiv.org/pdf/2407.04495v4.pdf","comment":"36 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.02268v3","updated":"2025-03-03T05:32:47Z","published":"2024-10-03T07:40:14Z","title":"Structural-Entropy-Based Sample Selection for Efficient and Effective\n Learning","summary":" Sample selection improves the efficiency and effectiveness of machine\nlearning models by providing informative and representative samples. Typically,\nsamples can be modeled as a sample graph, where nodes are samples and edges\nrepresent their similarities. Most existing methods are based on local\ninformation, such as the training difficulty of samples, thereby overlooking\nglobal information, such as connectivity patterns. This oversight can result in\nsuboptimal selection because global information is crucial for ensuring that\nthe selected samples well represent the structural properties of the graph. To\naddress this issue, we employ structural entropy to quantify global information\nand losslessly decompose it from the whole graph to individual nodes using the\nShapley value. 
Based on the decomposition, we present\n$\\textbf{S}$tructural-$\\textbf{E}$ntropy-based sample $\\textbf{S}$election\n($\\textbf{SES}$), a method that integrates both global and local information to\nselect informative and representative samples. SES begins by constructing a\n$k$NN-graph among samples based on their similarities. It then measures sample\nimportance by combining structural entropy (global metric) with training\ndifficulty (local metric). Finally, SES applies importance-biased blue noise\nsampling to select a set of diverse and representative samples. Comprehensive\nexperiments on three learning scenarios -- supervised learning, active\nlearning, and continual learning -- clearly demonstrate the effectiveness of\nour method.\n","authors":["Tianchi Xie","Jiangning Zhu","Guozu Ma","Minzhi Lin","Wei Chen","Weikai Yang","Shixia Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02268v3.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.01912v2","updated":"2025-03-03T05:25:43Z","published":"2025-02-04T01:05:12Z","title":"PATCH: a deep learning method to assess heterogeneity of artistic\n practice in historical paintings","summary":" The history of art has seen significant shifts in the manner in which\nartworks are created, making understanding of creative processes a central\nquestion in technical art history. In the Renaissance and Early Modern period,\npaintings were largely produced by master painters directing workshops of\napprentices who often contributed to projects. The masters varied significantly\nin artistic and managerial styles, meaning different combinations of artists\nand implements might be seen both between masters and within workshops or even\nindividual canvases. Information on how different workshops were managed and\nthe processes by which artworks were created remains elusive. 
Machine learning\nmethods have potential to unearth new information about artists' creative\nprocesses by extending the analysis of brushwork to a microscopic scale.\nAnalysis of workshop paintings, however, presents a challenge in that\ndocumentation of the artists and materials involved is sparse, meaning external\nexamples are not available to train networks to recognize their contributions.\nHere we present a novel machine learning approach we call pairwise assignment\ntraining for classifying heterogeneity (PATCH) that is capable of identifying\nindividual artistic practice regimes with no external training data, or \"ground\ntruth.\" The method achieves unsupervised results by supervised means, and\noutperforms both simple statistical procedures and unsupervised machine\nlearning methods. We apply this method to two historical paintings by the\nSpanish Renaissance master, El Greco: The Baptism of Christ and Christ on the\nCross with Landscape, and our findings regarding the former potentially\nchallenge previous work that has assigned the painting to workshop members.\nFurther, the results of our analyses create a measure of heterogeneity of\nartistic practice that can be used to characterize artworks across time and\nspace.\n","authors":["Andrew Van Horn","Lauryn Smith","Mahamad Mahmoud","Michael McMaster","Clara Pinchbeck","Ina Martin","Andrew Lininger","Anthony Ingrisano","Adam Lowe","Carlos Bayod","Elizabeth Bolman","Kenneth Singer","Michael Hinczewski"],"pdf_url":"https://arxiv.org/pdf/2502.01912v2.pdf","comment":"main text: 16 pages, 6 figures; SI: 7 pages, 3 figures; v2: minor\n typo corrections, higher resolution figures"},{"id":"http://arxiv.org/abs/2405.13937v8","updated":"2025-03-03T05:10:46Z","published":"2024-05-22T19:10:24Z","title":"Node-Time Conditional Prompt Learning In Dynamic Graphs","summary":" Dynamic graphs capture evolving interactions between entities, such as in\nsocial networks, online learning platforms, and crowdsourcing projects. 
For\ndynamic graph modeling, dynamic graph neural networks (DGNNs) have emerged as a\nmainstream technique. However, they are generally pre-trained on the link\nprediction task, leaving a significant gap from the objectives of downstream\ntasks such as node classification. To bridge the gap, prompt-based learning has\ngained traction on graphs, but most existing efforts focus on static graphs,\nneglecting the evolution of dynamic graphs. In this paper, we propose\nDYGPROMPT, a novel pre-training and prompt learning framework for dynamic graph\nmodeling. First, we design dual prompts to address the gap in both task\nobjectives and temporal variations across pre-training and downstream tasks.\nSecond, we recognize that node and time features mutually characterize each\nother, and propose dual condition-nets to model the evolving node-time patterns\nin downstream tasks. Finally, we thoroughly evaluate and analyze DYGPROMPT\nthrough extensive experiments on four public datasets.\n","authors":["Xingtong Yu","Zhenghao Liu","Xinming Zhang","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2405.13937v8.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2409.07002v2","updated":"2025-03-03T04:32:29Z","published":"2024-09-11T04:30:45Z","title":"AdvLogo: Adversarial Patch Attack against Object Detectors based on\n Diffusion Models","summary":" With the rapid development of deep learning, object detectors have\ndemonstrated impressive performance; however, vulnerabilities still exist in\ncertain scenarios. Current research exploring the vulnerabilities using\nadversarial patches often struggles to balance the trade-off between attack\neffectiveness and visual quality. To address this problem, we propose a novel\nframework of patch attack from semantic perspective, which we refer to as\nAdvLogo. 
Based on the hypothesis that every semantic space contains an\nadversarial subspace where images can cause detectors to fail in recognizing\nobjects, we leverage the semantic understanding of the diffusion denoising\nprocess and drive the process to adversarial subareas by perturbing the latent\nand unconditional embeddings at the last timestep. To mitigate the distribution\nshift that exposes a negative impact on image quality, we apply perturbation to\nthe latent in frequency domain with the Fourier Transform. Experimental results\ndemonstrate that AdvLogo achieves strong attack performance while maintaining\nhigh visual quality.\n","authors":["Boming Miao","Chunxiao Li","Yao Zhu","Weixiang Sun","Zizhe Wang","Xiaoyi Wang","Chuanlong Xie"],"pdf_url":"https://arxiv.org/pdf/2409.07002v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09906v3","updated":"2025-03-03T04:28:49Z","published":"2024-02-15T12:12:19Z","title":"Generative Representational Instruction Tuning","summary":" All text-based language problems can be reduced to either generation or\nembedding. Current models only perform well at one or the other. We introduce\ngenerative representational instruction tuning (GRIT) whereby a large language\nmodel is trained to handle both generative and embedding tasks by\ndistinguishing between them through instructions. Compared to other open\nmodels, our resulting GritLM 7B sets a new state of the art on the Massive Text\nEmbedding Benchmark (MTEB) and outperforms all models up to its size on a range\nof generative tasks. By scaling up further, GritLM 8x7B outperforms all open\ngenerative language models that we tried while still being among the best\nembedding models. 
Notably, we find that GRIT matches training on only\ngenerative or embedding data, thus we can unify both at no performance loss.\nAmong other benefits, the unification via GRIT speeds up Retrieval-Augmented\nGeneration (RAG) by > 60% for long documents, by no longer requiring separate\nretrieval and generation models. Models, code, etc. are freely available at\nhttps://github.com/ContextualAI/gritlm.\n","authors":["Niklas Muennighoff","Hongjin Su","Liang Wang","Nan Yang","Furu Wei","Tao Yu","Amanpreet Singh","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.09906v3.pdf","comment":"67 pages (16 main), 25 figures, 34 tables"},{"id":"http://arxiv.org/abs/2403.17010v3","updated":"2025-03-03T04:22:19Z","published":"2024-03-25T17:59:59Z","title":"Calib3D: Calibrating Model Preferences for Reliable 3D Scene\n Understanding","summary":" Safety-critical 3D scene understanding tasks necessitate not only accurate\nbut also confident predictions from 3D perception models. This study introduces\nCalib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D\nscene understanding models from an uncertainty estimation viewpoint. We\ncomprehensively evaluate 28 state-of-the-art models across 10 diverse 3D\ndatasets, uncovering insightful phenomena that cope with both the aleatoric and\nepistemic uncertainties in 3D scene understanding. We discover that despite\nachieving impressive levels of accuracy, existing models frequently fail to\nprovide reliable uncertainty estimates -- a pitfall that critically undermines\ntheir applicability in safety-sensitive contexts. Through extensive analysis of\nkey factors such as network capacity, LiDAR representations, rasterization\nresolutions, and 3D data augmentation techniques, we correlate these aspects\ndirectly with the model calibration efficacy. 
Furthermore, we introduce DeptS,\na novel depth-aware scaling approach aimed at enhancing 3D model calibration.\nExtensive experiments across a wide range of configurations validate the\nsuperiority of our method. We hope this work could serve as a cornerstone for\nfostering reliable 3D scene understanding. Code and benchmark toolkit are\npublicly available.\n","authors":["Lingdong Kong","Xiang Xu","Jun Cen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17010v3.pdf","comment":"WACV 2025 Oral; 26 pages, 8 figures, 12 tables; Code at\n https://github.com/ldkong1205/Calib3D"},{"id":"http://arxiv.org/abs/2410.08892v2","updated":"2025-03-03T04:14:17Z","published":"2024-10-11T15:10:38Z","title":"Federated Learning in Practice: Reflections and Projections","summary":" Federated Learning (FL) is a machine learning technique that enables multiple\nentities to collaboratively learn a shared model without exchanging their local\ndata. Over the past decade, FL systems have achieved substantial progress,\nscaling to millions of devices across various learning domains while offering\nmeaningful differential privacy (DP) guarantees. Production systems from\norganizations like Google, Apple, and Meta demonstrate the real-world\napplicability of FL. However, key challenges remain, including verifying\nserver-side DP guarantees and coordinating training across heterogeneous\ndevices, limiting broader adoption. Additionally, emerging trends such as large\n(multi-modal) models and blurred lines between training, inference, and\npersonalization challenge traditional FL frameworks. In response, we propose a\nredefined FL framework that prioritizes privacy principles rather than rigid\ndefinitions. We also chart a path forward by leveraging trusted execution\nenvironments and open-source ecosystems to address these challenges and\nfacilitate future advancements in FL.\n","authors":["Katharine Daly","Hubert Eichner","Peter Kairouz","H. 
Brendan McMahan","Daniel Ramage","Zheng Xu"],"pdf_url":"https://arxiv.org/pdf/2410.08892v2.pdf","comment":"Published at 2024 IEEE 6th International Conference on Trust, Privacy\n and Security in Intelligent Systems, and Applications (TPS-ISA)"},{"id":"http://arxiv.org/abs/2411.02728v2","updated":"2025-03-03T04:04:30Z","published":"2024-11-05T01:55:07Z","title":"Compositional simulation-based inference for time series","summary":" Amortized simulation-based inference (SBI) methods train neural networks on\nsimulated data to perform Bayesian inference. While this strategy avoids the\nneed for tractable likelihoods, it often requires a large number of simulations\nand has been challenging to scale to time series data. Scientific simulators\nfrequently emulate real-world dynamics through thousands of single-state\ntransitions over time. We propose an SBI approach that can exploit such\nMarkovian simulators by locally identifying parameters consistent with\nindividual state transitions. We then compose these local results to obtain a\nposterior over parameters that align with the entire time series observation.\nWe focus on applying this approach to neural posterior score estimation but\nalso show how it can be applied, e.g., to neural likelihood (ratio) estimation.\nWe demonstrate that our approach is more simulation-efficient than directly\nestimating the global posterior on several synthetic benchmark tasks and\nsimulators used in ecology and epidemiology. Finally, we validate scalability\nand simulation efficiency of our approach by applying it to a high-dimensional\nKolmogorov flow simulator with around one million data dimensions.\n","authors":["Manuel Gloeckler","Shoji Toyota","Kenji Fukumizu","Jakob H. 
Macke"],"pdf_url":"https://arxiv.org/pdf/2411.02728v2.pdf","comment":"To be published in the proceedings of the Thirteenth International\n Conference on Learning Representations (ICLR 2025), Singapore, 2025"},{"id":"http://arxiv.org/abs/2502.02954v2","updated":"2025-03-03T03:56:38Z","published":"2025-02-05T07:35:15Z","title":"Direct Distributional Optimization for Provable Alignment of Diffusion\n Models","summary":" We introduce a novel alignment method for diffusion models from distribution\noptimization perspectives while providing rigorous convergence guarantees. We\nfirst formulate the problem as a generic regularized loss minimization over\nprobability distributions and directly optimize the distribution using the Dual\nAveraging method. Next, we enable sampling from the learned distribution by\napproximating its score function via Doob's $h$-transform technique. The\nproposed framework is supported by rigorous convergence guarantees and an\nend-to-end bound on the sampling error, which imply that when the original\ndistribution's score is known accurately, the complexity of sampling from\nshifted distributions is independent of isoperimetric conditions. 
This\nframework is broadly applicable to general distribution optimization problems,\nincluding alignment tasks in Reinforcement Learning with Human Feedback (RLHF),\nDirect Preference Optimization (DPO), and Kahneman-Tversky Optimization (KTO).\nWe empirically validate its performance on synthetic and image datasets using\nthe DPO objective.\n","authors":["Ryotaro Kawata","Kazusato Oko","Atsushi Nitanda","Taiji Suzuki"],"pdf_url":"https://arxiv.org/pdf/2502.02954v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00617v4","updated":"2025-03-03T03:41:11Z","published":"2024-06-30T08:00:34Z","title":"Iterative Nash Policy Optimization: Aligning LLMs with General\n Preferences via No-Regret Learning","summary":" Reinforcement Learning with Human Feedback (RLHF) has achieved great success\nin aligning large language models (LLMs) with human preferences. Prevalent RLHF\napproaches are reward-based, following the Bradley-Terry (BT) model assumption,\nwhich may not fully capture the complexity of human preferences. In this paper,\nwe explore RLHF under a general preference framework and approach it from a\ngame-theoretic perspective. Specifically, we formulate the problem as a\ntwo-player game and propose a novel online algorithm, iterative Nash policy\noptimization (INPO). The key idea is to let the policy play against itself via\nno-regret learning, thereby approximating the Nash policy. Unlike previous\nmethods, INPO bypasses the need for estimating the expected win rate for\nindividual responses, which typically incurs high computational or annotation\ncosts. Instead, we introduce a new loss objective that is directly minimized\nover a preference dataset. We provide theoretical analysis for our approach and\ndemonstrate its effectiveness through experiments on various representative\nbenchmarks. 
With an LLaMA-3-8B-based SFT model, INPO achieves a 42.6%\nlength-controlled win rate on AlpacaEval 2.0 and a 37.8% win rate on\nArena-Hard, showing substantial improvement over the state-of-the-art online\nRLHF algorithms.\n","authors":["Yuheng Zhang","Dian Yu","Baolin Peng","Linfeng Song","Ye Tian","Mingyue Huo","Nan Jiang","Haitao Mi","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2407.00617v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04405v3","updated":"2025-03-03T03:39:50Z","published":"2024-07-05T10:41:15Z","title":"Discovering physical laws with parallel combinatorial tree search","summary":" Symbolic regression plays a crucial role in modern scientific research thanks\nto its capability of discovering concise and interpretable mathematical\nexpressions from data. A grand challenge lies in the arduous search for\nparsimonious and generalizable mathematical formulas, in an infinite search\nspace, while intending to fit the training data. Existing algorithms have faced\na critical bottleneck of accuracy and efficiency over a decade when handling\nproblems of complexity, which essentially hinders the pace of applying symbolic\nregression for scientific exploration across interdisciplinary domains. To this\nend, we introduce a parallel combinatorial tree search (PCTS) model to\nefficiently distill generic mathematical expressions from limited data. Through\na series of extensive experiments, we demonstrate the superior accuracy and\nefficiency of PCTS for equation discovery, which greatly outperforms the\nstate-of-the-art baseline models on over 200 synthetic and experimental\ndatasets (e.g., lifting its performance by up to 99% accuracy improvement and\none-order of magnitude speed up). 
PCTS represents a key advance in accurate and\nefficient data-driven discovery of symbolic, interpretable models (e.g.,\nunderlying physical laws) and marks a pivotal transition towards scalable\nsymbolic learning.\n","authors":["Kai Ruan","Yilong Xu","Ze-Feng Gao","Yike Guo","Hao Sun","Ji-Rong Wen","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.04405v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.01117v2","updated":"2025-03-03T03:35:00Z","published":"2025-02-03T07:13:59Z","title":"Learning to Learn Weight Generation via Trajectory Diffusion","summary":" Diffusion-based algorithms have emerged as promising techniques for weight\ngeneration, particularly in scenarios like multi-task learning that require\nfrequent weight updates. However, existing solutions suffer from limited\ncross-task transferability. In addition, they only utilize optimal weights as\ntraining samples, ignoring the value of other weights in the optimization\nprocess. To address these issues, we propose Lt-Di, which integrates the\ndiffusion algorithm with meta-learning to generate weights for unseen tasks.\nFurthermore, we extend the vanilla diffusion algorithm into a trajectory\ndiffusion algorithm to utilize other weights along the optimization trajectory.\nTrajectory diffusion decomposes the entire diffusion chain into multiple\nshorter ones, improving training and inference efficiency. We analyze the\nconvergence properties of the weight generation paradigm and improve\nconvergence efficiency without additional time overhead. 
Our experiments\ndemonstrate Lt-Di's higher accuracy while reducing computational overhead\nacross various tasks, including zero-shot and few-shot learning, multi-domain\ngeneralization, and large-scale language model fine-tuning.Our code is released\nat https://anonymous.4open.science/r/Lt-Di-0E51.\n","authors":["Yunchuan Guan","Yu Liu","Ke Zhou","Zhiqi Shen","Serge Belongie","Jenq-Neng Hwang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2502.01117v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17674v2","updated":"2025-03-03T03:24:09Z","published":"2024-07-24T23:47:05Z","title":"Struc2mapGAN: improving synthetic cryo-EM density maps with generative\n adversarial networks","summary":" Generating synthetic cryogenic electron microscopy 3D density maps from\nmolecular structures has potential important applications in structural\nbiology. Yet existing simulation-based methods cannot mimic all the complex\nfeatures present in experimental maps, such as secondary structure elements. As\nan alternative, we propose struc2mapGAN, a novel data-driven method that\nemploys a generative adversarial network to produce improved experimental-like\ndensity maps from molecular structures. More specifically, struc2mapGAN uses a\nnested U-Net architecture as the generator, with an additional L1 loss term and\nfurther processing of raw training experimental maps to enhance learning\nefficiency. 
While struc2mapGAN can promptly generate maps after training, we\ndemonstrate that it outperforms existing simulation-based methods for a wide\narray of tested maps and across various evaluation metrics.\n","authors":["Chenwei Zhang","Anne Condon","Khanh Dao Duc"],"pdf_url":"https://arxiv.org/pdf/2407.17674v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13213v2","updated":"2025-03-03T03:20:08Z","published":"2024-10-17T04:37:37Z","title":"LLMOPT: Learning to Define and Solve General Optimization Problems from\n Scratch","summary":" Optimization problems are prevalent across various scenarios. Formulating and\nthen solving optimization problems described by natural language often requires\nhighly specialized human expertise, which could block the widespread\napplication of optimization-based decision making. To automate problem\nformulation and solving, leveraging large language models (LLMs) has emerged as\na potential way. However, this kind of approach suffers from the issue of\noptimization generalization. Namely, the accuracy of most current LLM-based\nmethods and the generality of optimization problem types that they can model\nare still limited. In this paper, we propose a unified learning-based framework\ncalled LLMOPT to boost optimization generalization. Starting from the natural\nlanguage descriptions of optimization problems and a pre-trained LLM, LLMOPT\nconstructs the introduced five-element formulation as a universal model for\nlearning to define diverse optimization problem types. Then, LLMOPT employs the\nmulti-instruction tuning to enhance both problem formalization and solver code\ngeneration accuracy and generality. After that, to prevent hallucinations in\nLLMs, such as sacrificing solving accuracy to avoid execution errors, the model\nalignment and self-correction mechanism are adopted in LLMOPT. 
We evaluate the\noptimization generalization ability of LLMOPT and compared methods across six\nreal-world datasets covering roughly 20 fields such as health, environment,\nenergy and manufacturing, etc. Extensive experiment results show that LLMOPT is\nable to model various optimization problem types such as linear/nonlinear\nprogramming, mixed integer programming, and combinatorial optimization, and\nachieves a notable 11.08% average solving accuracy improvement compared with\nthe state-of-the-art methods. The code is available at\nhttps://github.com/caigaojiang/LLMOPT.\n","authors":["Caigao Jiang","Xiang Shu","Hong Qian","Xingyu Lu","Jun Zhou","Aimin Zhou","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2410.13213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00645v2","updated":"2025-03-03T03:19:08Z","published":"2024-10-01T12:58:37Z","title":"TSVD: Bridging Theory and Practice in Continual Learning with\n Pre-trained Models","summary":" The goal of continual learning (CL) is to train a model that can solve\nmultiple tasks presented sequentially. Recent CL approaches have achieved\nstrong performance by leveraging large pre-trained models that generalize well\nto downstream tasks. However, such methods lack theoretical guarantees, making\nthem prone to unexpected failures. Conversely, principled CL approaches often\nfail to achieve competitive performance. In this work, we aim to bridge this\ngap between theory and practice by designing a simple CL method that is\ntheoretically sound and highly performant. Specifically, we lift pre-trained\nfeatures into a higher dimensional space and formulate an over-parametrized\nminimum-norm least-squares problem. We find that the lifted features are highly\nill-conditioned, potentially leading to large training errors (numerical\ninstability) and increased generalization errors. We address these challenges\nby continually truncating the singular value decomposition (SVD) of the lifted\nfeatures. 
Our approach, termed TSVD, is stable with respect to the choice of\nhyperparameters, can handle hundreds of tasks, and outperforms state-of-the-art\nCL methods on multiple datasets. Importantly, our method satisfies a recurrence\nrelation throughout its continual learning process, which allows us to prove it\nmaintains small training and generalization errors by appropriately truncating\na fraction of SVD factors. This results in a stable continual learning method\nwith strong empirical performance and theoretical guarantees. Code available:\nhttps://github.com/liangzu/tsvd.\n","authors":["Liangzu Peng","Juan Elenter","Joshua Agterberg","Alejandro Ribeiro","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2410.00645v2.pdf","comment":"47 pages, 18 figures, 16 tables (v2, accepted to ICLR 2025)"},{"id":"http://arxiv.org/abs/2410.13085v2","updated":"2025-03-03T03:08:28Z","published":"2024-10-16T23:03:27Z","title":"MMed-RAG: Versatile Multimodal RAG System for Medical Vision Language\n Models","summary":" Artificial Intelligence (AI) has demonstrated significant potential in\nhealthcare, particularly in disease diagnosis and treatment planning. Recent\nprogress in Medical Large Vision-Language Models (Med-LVLMs) has opened up new\npossibilities for interactive diagnostic tools. However, these models often\nsuffer from factual hallucination, which can lead to incorrect diagnoses.\nFine-tuning and retrieval-augmented generation (RAG) have emerged as methods to\naddress these issues. However, the amount of high-quality data and distribution\nshifts between training data and deployment data limit the application of\nfine-tuning methods. Although RAG is lightweight and effective, existing\nRAG-based approaches are not sufficiently general to different medical domains\nand can potentially cause misalignment issues, both between modalities and\nbetween the model and the ground truth. 
In this paper, we propose a versatile\nmultimodal RAG system, MMed-RAG, designed to enhance the factuality of\nMed-LVLMs. Our approach introduces a domain-aware retrieval mechanism, an\nadaptive retrieved contexts selection method, and a provable RAG-based\npreference fine-tuning strategy. These innovations make the RAG process\nsufficiently general and reliable, significantly improving alignment when\nintroducing retrieved contexts. Experimental results across five medical\ndatasets (involving radiology, ophthalmology, pathology) on medical VQA and\nreport generation demonstrate that MMed-RAG can achieve an average improvement\nof 43.8% in the factual accuracy of Med-LVLMs. Our data and code are available\nin https://github.com/richard-peng-xia/MMed-RAG.\n","authors":["Peng Xia","Kangyu Zhu","Haoran Li","Tianze Wang","Weijia Shi","Sheng Wang","Linjun Zhang","James Zou","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2410.13085v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2406.06600v3","updated":"2025-03-03T03:05:30Z","published":"2024-06-06T13:44:57Z","title":"HORAE: A Domain-Agnostic Modeling Language for Automating Multimodal\n Service Regulation","summary":" Artificial intelligence is rapidly encroaching on the field of service\nregulation. This work-in-progress article presents the design principles behind\nHORAE, a unified specification language to model multimodal regulation rules\nacross a diverse set of domains. 
We show how HORAE facilitates an intelligent\nservice regulation pipeline by further exploiting a fine-tuned large language\nmodel named HORAE that automates the HORAE modeling process, thereby yielding\nan end-to-end framework for fully automated intelligent service regulation.\n","authors":["Yutao Sun","Mingshuai Chen","Kangjia Zhao","Jintao Chen"],"pdf_url":"https://arxiv.org/pdf/2406.06600v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00564v3","updated":"2025-03-03T02:59:29Z","published":"2024-10-01T10:25:03Z","title":"Scaling Offline Model-Based RL via Jointly-Optimized World-Action Model\n Pretraining","summary":" A significant aspiration of offline reinforcement learning (RL) is to develop\na generalist agent with high capabilities from large and heterogeneous\ndatasets. However, prior approaches that scale offline RL either rely heavily\non expert trajectories or struggle to generalize to diverse unseen tasks.\nInspired by the excellent generalization of world model in conditional video\ngeneration, we explore the potential of image observation-based world model for\nscaling offline RL and enhancing generalization on novel tasks. In this paper,\nwe introduce JOWA: Jointly-Optimized World-Action model, an offline model-based\nRL agent pretrained on multiple Atari games with 6 billion tokens data to learn\ngeneral-purpose representation and decision-making ability. Our method jointly\noptimizes a world-action model through a shared transformer backbone, which\nstabilize temporal difference learning with large models during pretraining.\nMoreover, we propose a provably efficient and parallelizable planning algorithm\nto compensate for the Q-value estimation error and thus search out better\npolicies. 
Experimental results indicate that our largest agent, with 150\nmillion parameters, achieves 78.9% human-level performance on pretrained games\nusing only 10% subsampled offline data, outperforming existing state-of-the-art\nlarge-scale offline RL baselines by 31.6% on averange. Furthermore, JOWA scales\nfavorably with model capacity and can sample-efficiently transfer to novel\ngames using only 5k offline fine-tuning data (approximately 4 trajectories) per\ngame, demonstrating superior generalization. We will release codes and model\nweights at https://github.com/CJReinforce/JOWA\n","authors":["Jie Cheng","Ruixi Qiao","Yingwei Ma","Binhua Li","Gang Xiong","Qinghai Miao","Yongbin Li","Yisheng Lv"],"pdf_url":"https://arxiv.org/pdf/2410.00564v3.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2410.01337v3","updated":"2025-03-03T02:50:30Z","published":"2024-10-02T08:54:18Z","title":"PhyMPGN: Physics-encoded Message Passing Graph Network for\n spatiotemporal PDE systems","summary":" Solving partial differential equations (PDEs) serves as a cornerstone for\nmodeling complex dynamical systems. Recent progresses have demonstrated grand\nbenefits of data-driven neural-based models for predicting spatiotemporal\ndynamics (e.g., tremendous speedup gain compared with classical numerical\nmethods). However, most existing neural models rely on rich training data, have\nlimited extrapolation and generalization abilities, and suffer to produce\nprecise or reliable physical prediction under intricate conditions (e.g.,\nirregular mesh or geometry, complex boundary conditions, diverse PDE\nparameters, etc.). 
To this end, we propose a new graph learning approach,\nnamely, Physics-encoded Message Passing Graph Network (PhyMPGN), to model\nspatiotemporal PDE systems on irregular meshes given small training datasets.\nSpecifically, we incorporate a GNN into a numerical integrator to approximate\nthe temporal marching of spatiotemporal dynamics for a given PDE system.\nConsidering that many physical phenomena are governed by diffusion processes,\nwe further design a learnable Laplace block, which encodes the discrete\nLaplace-Beltrami operator, to aid and guide the GNN learning in a physically\nfeasible solution space. A boundary condition padding strategy is also designed\nto improve the model convergence and accuracy. Extensive experiments\ndemonstrate that PhyMPGN is capable of accurately predicting various types of\nspatiotemporal dynamics on coarse unstructured meshes, consistently achieves\nthe state-of-the-art results, and outperforms other baselines with considerable\ngains.\n","authors":["Bocheng Zeng","Qi Wang","Mengtao Yan","Yang Liu","Ruizhi Chengze","Yi Zhang","Hongsheng Liu","Zidong Wang","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2410.01337v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08109v3","updated":"2025-03-03T02:45:58Z","published":"2024-10-10T16:56:05Z","title":"A Closer Look at Machine Unlearning for Large Language Models","summary":" Large language models (LLMs) may memorize sensitive or copyrighted content,\nraising privacy and legal concerns. Due to the high cost of retraining from\nscratch, researchers attempt to employ machine unlearning to remove specific\ncontent from LLMs while preserving the overall performance. In this paper, we\ndiscuss several issues in machine unlearning for LLMs and provide our insights\non possible approaches. To address the issue of inadequate evaluation of model\noutputs after unlearning, we introduce three additional metrics to evaluate\ntoken diversity, sentence semantics, and factual correctness. 
We then\ncategorize unlearning methods into untargeted and targeted, and discuss their\nissues respectively. Specifically, the behavior that untargeted unlearning\nattempts to approximate is unpredictable and may involve hallucinations, and\nexisting regularization is insufficient for targeted unlearning. To alleviate\nthese issues, we propose using the objective of maximizing entropy (ME) for\nuntargeted unlearning and incorporate answer preservation (AP) loss as\nregularization for targeted unlearning. Experimental results across three\nscenarios, i.e., fictitious unlearning, continual unlearning, and real-world\nunlearning, demonstrate the effectiveness of our approaches. The code is\navailable at https://github.com/sail-sg/closer-look-LLM-unlearning.\n","authors":["Xiaojian Yuan","Tianyu Pang","Chao Du","Kejiang Chen","Weiming Zhang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2410.08109v3.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2411.18872v2","updated":"2025-03-03T02:41:10Z","published":"2024-11-28T02:50:42Z","title":"A Lean Dataset for International Math Olympiad: Small Steps towards\n Writing Math Proofs for Hard Problems","summary":" Using AI to write formal proofs for mathematical problems is a challenging\ntask that has seen some advancements in recent years. Automated systems such as\nLean can verify the correctness of proofs written in formal language, yet\nwriting the proofs in formal language can be challenging for humans and\nmachines. The miniF2F benchmark has 20 IMO problems in its test set, yet formal\nproofs are available only for 6 of these problems (3 of which are only written\nby mathematicians). The model with best accuracy can only prove 2 of these 20\nIMO problems, from 1950s and 60s, while its training set is a secret. In this\nwork, we write complete, original formal proofs for the remaining IMO problems\nin Lean along with 3 extra problems from IMO 2022 and 2023. 
This effort expands\nthe availability of proof currently in the public domain by creating 5,880\nlines of Lean proof. The goal of the paper is to pave the way for developing AI\nmodels that can automatically write the formal proofs for all the IMO problems\nin miniF2F and beyond by providing an evaluation benchmark. In this pursuit, we\ndevise a method to decompose the proofs of these problems into their building\nblocks, constructing a dataset of 1,329 lemmas with more than 40k lines of Lean\ncode. These lemmas are not trivial, yet they are approachable, providing the\nopportunity to evaluate and diagnose the failures and successes of AI models.\nWe evaluate the ability of the SOTA LLMs on our dataset and analyze their\nsuccess and failure modes from different perspectives. Our dataset and code is\navailable at: https://github.com/roozbeh-yz/IMO-Steps.\n","authors":["Roozbeh Yousefzadeh","Xuenan Cao","Azim Ospanov"],"pdf_url":"https://arxiv.org/pdf/2411.18872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21186v2","updated":"2025-03-03T02:33:31Z","published":"2025-02-28T16:02:23Z","title":"Scalable Decision-Making in Stochastic Environments through Learned\n Temporal Abstraction","summary":" Sequential decision-making in high-dimensional continuous action spaces,\nparticularly in stochastic environments, faces significant computational\nchallenges. We explore this challenge in the traditional offline RL setting,\nwhere an agent must learn how to make decisions based on data collected through\na stochastic behavior policy. We present Latent Macro Action Planner (L-MAP),\nwhich addresses this challenge by learning a set of temporally extended\nmacro-actions through a state-conditional Vector Quantized Variational\nAutoencoder (VQ-VAE), effectively reducing action dimensionality. L-MAP employs\na (separate) learned prior model that acts as a latent transition model and\nallows efficient sampling of plausible actions. 
During planning, our approach\naccounts for stochasticity in both the environment and the behavior policy by\nusing Monte Carlo tree search (MCTS). In offline RL settings, including\nstochastic continuous control tasks, L-MAP efficiently searches over discrete\nlatent actions to yield high expected returns. Empirical results demonstrate\nthat L-MAP maintains low decision latency despite increased action\ndimensionality. Notably, across tasks ranging from continuous control with\ninherently stochastic dynamics to high-dimensional robotic hand manipulation,\nL-MAP significantly outperforms existing model-based methods and performs\non-par with strong model-free actor-critic baselines, highlighting the\neffectiveness of the proposed approach in planning in complex and stochastic\nenvironments with high-dimensional action spaces.\n","authors":["Baiting Luo","Ava Pettet","Aron Laszka","Abhishek Dubey","Ayan Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2502.21186v2.pdf","comment":"Accepted by ICLR2025. Code would be available at\n https://github.com/BaitingLuo/L-MAP.git"},{"id":"http://arxiv.org/abs/2412.01021v2","updated":"2025-03-03T02:13:49Z","published":"2024-12-02T00:41:25Z","title":"On the Feature Learning in Diffusion Models","summary":" The predominant success of diffusion models in generative modeling has\nspurred significant interest in understanding their theoretical foundations. In\nthis work, we propose a feature learning framework aimed at analyzing and\ncomparing the training dynamics of diffusion models with those of traditional\nclassification models. Our theoretical analysis demonstrates that diffusion\nmodels, due to the denoising objective, are encouraged to learn more balanced\nand comprehensive representations of the data. In contrast, neural networks\nwith a similar architecture trained for classification tend to prioritize\nlearning specific patterns in the data, often focusing on easy-to-learn\ncomponents. 
To support these theoretical insights, we conduct several\nexperiments on both synthetic and real-world datasets, which empirically\nvalidate our findings and highlight the distinct feature learning dynamics in\ndiffusion models compared to classification.\n","authors":["Andi Han","Wei Huang","Yuan Cao","Difan Zou"],"pdf_url":"https://arxiv.org/pdf/2412.01021v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19228v3","updated":"2025-03-03T02:06:52Z","published":"2024-04-30T03:15:04Z","title":"Weighted Point Set Embedding for Multimodal Contrastive Learning Toward\n Optimal Similarity Metric","summary":" In typical multimodal contrastive learning, such as CLIP, encoders produce\none point in the latent representation space for each input. However, one-point\nrepresentation has difficulty in capturing the relationship and the similarity\nstructure of a huge amount of instances in the real world. For richer classes\nof the similarity, we propose the use of weighted point sets, namely, sets of\npairs of weight and vector, as representations of instances. In this work, we\ntheoretically show the benefit of our proposed method through a new\nunderstanding of the contrastive loss of CLIP, which we call symmetric InfoNCE.\nWe clarify that the optimal similarity that minimizes symmetric InfoNCE is the\npointwise mutual information, and show an upper bound of excess risk on\ndownstream classification tasks of representations that achieve the optimal\nsimilarity. In addition, we show that our proposed similarity based on weighted\npoint sets consistently achieves the optimal similarity. 
To verify the\neffectiveness of our proposed method, we demonstrate pretraining of text-image\nrepresentation models and classification tasks on common benchmarks.\n","authors":["Toshimitsu Uesaka","Taiji Suzuki","Yuhta Takida","Chieh-Hsin Lai","Naoki Murata","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2404.19228v3.pdf","comment":"ICLR 2025 (Spotlight)"},{"id":"http://arxiv.org/abs/2309.13838v2","updated":"2025-03-03T01:47:00Z","published":"2023-09-25T02:50:22Z","title":"Penalized Principal Component Analysis Using Smoothing","summary":" Principal components computed via PCA (principal component analysis) are\ntraditionally used to reduce dimensionality in genomic data or to correct for\npopulation stratification. In this paper, we explore the penalized eigenvalue\nproblem (PEP) which reformulates the computation of the first eigenvector as an\noptimization problem and adds an $L_1$ penalty constraint to enforce sparseness\nof the solution. The contribution of our article is threefold. First, we extend\nPEP by applying smoothing to the original LASSO-type $L_1$ penalty. This allows\none to compute analytical gradients which enable faster and more efficient\nminimization of the objective function associated with the optimization\nproblem. Second, we demonstrate how higher order eigenvectors can be calculated\nwith PEP using established results from singular value decomposition (SVD).\nThird, we present four experimental studies to demonstrate the usefulness of\nthe smoothed penalized eigenvectors. Using data from the 1000 Genomes Project\ndataset, we empirically demonstrate that our proposed smoothed PEP allows one\nto increase numerical stability and obtain meaningful eigenvectors. 
We also\nemploy the penalized eigenvector approach in two additional real data\napplications (computation of a polygenic risk score and clustering),\ndemonstrating that exchanging the penalized eigenvectors for their smoothed\ncounterparts can increase prediction accuracy in polygenic risk scores and\nenhance discernibility of clusterings. Moreover, we compare our proposed\nsmoothed PEP to seven state-of-the-art algorithms for sparse PCA and evaluate\nthe accuracy of the obtained eigenvectors, their support recovery, and their\nruntime.\n","authors":["Rebecca M. Hurwitz","Georg Hahn"],"pdf_url":"https://arxiv.org/pdf/2309.13838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02060v2","updated":"2025-03-03T01:25:46Z","published":"2024-09-03T17:08:20Z","title":"OLMoE: Open Mixture-of-Experts Language Models","summary":" We introduce OLMoE, a fully open, state-of-the-art language model leveraging\nsparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but\nuses only 1B per input token. We pretrain it on 5 trillion tokens and further\nadapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available\nmodels with similar active parameters, even surpassing larger ones like\nLlama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE\ntraining, analyze routing in our model showing high specialization, and\nopen-source all aspects of our work: model weights, training data, code, and\nlogs.\n","authors":["Niklas Muennighoff","Luca Soldaini","Dirk Groeneveld","Kyle Lo","Jacob Morrison","Sewon Min","Weijia Shi","Pete Walsh","Oyvind Tafjord","Nathan Lambert","Yuling Gu","Shane Arora","Akshita Bhagia","Dustin Schwenk","David Wadden","Alexander Wettig","Binyuan Hui","Tim Dettmers","Douwe Kiela","Ali Farhadi","Noah A. 
Smith","Pang Wei Koh","Amanpreet Singh","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2409.02060v2.pdf","comment":"63 pages (24 main), 36 figures, 17 tables"},{"id":"http://arxiv.org/abs/2407.10223v2","updated":"2025-03-03T01:21:39Z","published":"2024-07-14T14:26:17Z","title":"On Large Language Model Continual Unlearning","summary":" While large language models have demonstrated impressive performance across\nvarious domains and tasks, their security issues have become increasingly\nsevere. Machine unlearning has emerged as a representative approach for model\nsafety and security by removing the influence of undesired data on the target\nmodel. However, these methods do not sufficiently consider that unlearning\nrequests in real-world scenarios are continuously emerging, especially in the\ncontext of LLMs, which may lead to accumulated model utility loss that\neventually becomes unacceptable. Moreover, existing LLM unlearning methods\noften ignore previous data access limitations due to privacy concerns and\ncopyright protection. Without previous data, the utility preservation during\nunlearning is much harder. To overcome these challenges, we propose the OOO\nframework that includes an Orthogonal low-rank adapter (LoRA) for continually\nunlearning requested data and an Out-Of-Distribution (OOD) detector to measure\nthe similarity between input and unlearning data. The orthogonal LoRA achieves\nparameter disentanglement among continual unlearning requests. The OOD detector\nis trained with a novel contrastive entropy loss and utilizes a glocal-aware\nscoring mechanism. During inference, our OOO framework can decide whether and\nto what extent to load the unlearning LoRA based on the OOD detector's\npredicted similarity between the input and the unlearned knowledge. Notably,\nOOO's effectiveness does not rely on any retained data. We conducted extensive\nexperiments on OOO and state-of-the-art LLM unlearning methods across three\ntasks and seven datasets. 
The results indicate that OOO consistently achieves\nthe best unlearning effectiveness and utility preservation, especially when\nfacing continuous unlearning requests. The source codes can be found at\nhttps://github.com/GCYZSL/O3-LLM-UNLEARNING.\n","authors":["Chongyang Gao","Lixu Wang","Kaize Ding","Chenkai Weng","Xiao Wang","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.10223v2.pdf","comment":"This paper has been accepted by ICLR 2025. The first two authors\n contribute equally and they are ordered alphabetically"},{"id":"http://arxiv.org/abs/2407.10967v2","updated":"2025-03-03T01:19:23Z","published":"2024-07-15T17:59:23Z","title":"BECAUSE: Bilinear Causal Representation for Generalizable Offline\n Model-based Reinforcement Learning","summary":" Offline model-based reinforcement learning (MBRL) enhances data efficiency by\nutilizing pre-collected datasets to learn models and policies, especially in\nscenarios where exploration is costly or infeasible. Nevertheless, its\nperformance often suffers from the objective mismatch between model and policy\nlearning, resulting in inferior performance despite accurate model predictions.\nThis paper first identifies the primary source of this mismatch comes from the\nunderlying confounders present in offline data for MBRL. Subsequently, we\nintroduce \\textbf{B}ilin\\textbf{E}ar \\textbf{CAUS}al\nr\\textbf{E}presentation~(BECAUSE), an algorithm to capture causal\nrepresentation for both states and actions to reduce the influence of the\ndistribution shift, thus mitigating the objective mismatch problem.\nComprehensive evaluations on 18 tasks that vary in data quality and environment\ncontext demonstrate the superior performance of BECAUSE over existing offline\nRL algorithms. We show the generalizability and robustness of BECAUSE under\nfewer samples or larger numbers of confounders. 
Additionally, we offer\ntheoretical analysis of BECAUSE to prove its error bound and sample efficiency\nwhen integrating causal representation into offline MBRL.\n","authors":["Haohong Lin","Wenhao Ding","Jian Chen","Laixi Shi","Jiacheng Zhu","Bo Li","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.10967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14519v2","updated":"2025-03-03T00:51:41Z","published":"2024-09-22T16:25:31Z","title":"RobotFingerPrint: Unified Gripper Coordinate Space for Multi-Gripper\n Grasp Synthesis and Transfer","summary":" We introduce a novel grasp representation named the Unified Gripper\nCoordinate Space (UGCS) for grasp synthesis and grasp transfer. Our\nrepresentation leverages spherical coordinates to create a shared coordinate\nspace across different robot grippers, enabling it to synthesize and transfer\ngrasps for both novel objects and previously unseen grippers. The strength of\nthis representation lies in the ability to map palm and fingers of a gripper\nand the unified coordinate space. Grasp synthesis is formulated as predicting\nthe unified spherical coordinates on object surface points via a conditional\nvariational autoencoder. The predicted unified gripper coordinates establish\nexact correspondences between the gripper and object points, which is used to\noptimize grasp pose and joint values. Grasp transfer is facilitated through the\npoint-to-point correspondence between any two (potentially unseen) grippers and\nsolved via a similar optimization. Extensive simulation and real-world\nexperiments showcase the efficacy of the unified grasp representation for grasp\nsynthesis in generating stable and diverse grasps. Similarly, we showcase\nreal-world grasp transfer from human demonstrations across different objects.\n","authors":["Ninad Khargonkar","Luis Felipe Casas","Balakrishnan Prabhakaran","Yu Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.14519v2.pdf","comment":"8 pages, 11 figures, 3 tables. 
Project page available at\n https://irvlutd.github.io/RobotFingerPrint"},{"id":"http://arxiv.org/abs/2410.01417v2","updated":"2025-03-03T00:41:36Z","published":"2024-10-02T10:58:54Z","title":"The Labyrinth of Links: Navigating the Associative Maze of Multi-modal\n LLMs","summary":" Multi-modal Large Language Models (MLLMs) have exhibited impressive\ncapability. However, recently many deficiencies of MLLMs have been found\ncompared to human intelligence, $\\textit{e.g.}$, hallucination. To drive the\nMLLMs study, the community dedicated efforts to building larger benchmarks with\ncomplex tasks. In this paper, we propose benchmarking an essential but usually\noverlooked intelligence: $\\textbf{association}$, a human's basic capability to\nlink observation and prior practice memory. To comprehensively investigate\nMLLM's performance on the association, we formulate the association task and\ndevise a standard benchmark based on adjective and verb semantic concepts.\nInstead of costly data annotation and curation, we propose a convenient\n$\\textbf{annotation-free}$ construction method transforming the general dataset\nfor our association tasks. Simultaneously, we devise a rigorous data refinement\nprocess to eliminate confusion in the raw dataset. Building on this database,\nwe establish three levels of association tasks: single-step, synchronous, and\nasynchronous associations. Moreover, we conduct a comprehensive investigation\ninto the MLLMs' zero-shot association capabilities, addressing multiple\ndimensions, including three distinct memory strategies, both open-source and\nclosed-source MLLMs, cutting-edge Mixture-of-Experts (MoE) models, and the\ninvolvement of human experts. Our systematic investigation shows that current\nopen-source MLLMs consistently exhibit poor capability in our association\ntasks, even the currently state-of-the-art GPT-4V(vision) also has a\nsignificant gap compared to humans. 
We believe our benchmark would pave the way\nfor future MLLM studies. $\\textit{Our data and code are available at:}$\nhttps://mvig-rhos.com/llm_inception.\n","authors":["Hong Li","Nanxi Li","Yuanjie Chen","Jianbin Zhu","Qinlu Guo","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2410.01417v2.pdf","comment":"Accepted by ICLR 2025. Project page:\n https://mvig-rhos.com/llm_inception"},{"id":"http://arxiv.org/abs/2405.02318v2","updated":"2025-03-03T00:38:48Z","published":"2024-04-18T00:20:48Z","title":"NL2FOL: Translating Natural Language to First-Order Logic for Logical\n Fallacy Detection","summary":" Translating natural language into formal language such as First-Order Logic\n(FOL) is a foundational challenge in NLP with wide-ranging applications in\nautomated reasoning, misinformation tracking, and knowledge validation. In this\npaper, we introduce Natural Language to First-Order Logic (NL2FOL), a framework\nto autoformalize natural language to FOL step by step using Large Language\nModels (LLMs). Our approach addresses key challenges in this translation\nprocess, including the integration of implicit background knowledge. By\nleveraging structured representations generated by NL2FOL, we use\nSatisfiability Modulo Theory (SMT) solvers to reason about the logical validity\nof natural language statements. We present logical fallacy detection as a case\nstudy to evaluate the efficacy of NL2FOL. Being neurosymbolic, our approach\nalso provides interpretable insights into the reasoning process and\ndemonstrates robustness without requiring model fine-tuning or labeled training\ndata. Our framework achieves strong performance on multiple datasets. 
On the\nLOGIC dataset, NL2FOL achieves an F1-score of 78%, while generalizing\neffectively to the LOGICCLIMATE dataset with an F1-score of 80%.\n","authors":["Abhinav Lalwani","Tasha Kim","Lovish Chopra","Christopher Hahn","Zhijing Jin","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2405.02318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22729v2","updated":"2025-03-03T00:23:28Z","published":"2024-10-30T06:28:21Z","title":"Identifying Drift, Diffusion, and Causal Structure from Temporal\n Snapshots","summary":" Stochastic differential equations (SDEs) are a fundamental tool for modelling\ndynamic processes, including gene regulatory networks (GRNs), contaminant\ntransport, financial markets, and image generation. However, learning the\nunderlying SDE from data is a challenging task, especially if individual\ntrajectories are not observable. Motivated by burgeoning research in\nsingle-cell datasets, we present the first comprehensive approach for jointly\nidentifying the drift and diffusion of an SDE from its temporal marginals.\nAssuming linear drift and additive diffusion, we prove that these parameters\nare identifiable from marginals if and only if the initial distribution lacks\nany generalized rotational symmetries. We further prove that the causal graph\nof any SDE with additive diffusion can be recovered from the SDE parameters. To\ncomplement this theory, we adapt entropy-regularized optimal transport to\nhandle anisotropic diffusion, and introduce APPEX (Alternating Projection\nParameter Estimation from $X_0$), an iterative algorithm designed to estimate\nthe drift, diffusion, and causal graph of an additive noise SDE, solely from\ntemporal marginals. 
We show that APPEX iteratively decreases Kullback-Leibler\ndivergence to the true solution, and demonstrate its effectiveness on simulated\ndata from linear additive noise SDEs.\n","authors":["Vincent Guan","Joseph Janssen","Hossein Rahmani","Andrew Warren","Stephen Zhang","Elina Robeva","Geoffrey Schiebinger"],"pdf_url":"https://arxiv.org/pdf/2410.22729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16002v2","updated":"2025-03-03T00:11:11Z","published":"2024-05-25T01:44:35Z","title":"Does SGD really happen in tiny subspaces?","summary":" Understanding the training dynamics of deep neural networks is challenging\ndue to their high-dimensional nature and intricate loss landscapes. Recent\nstudies have revealed that, along the training trajectory, the gradient\napproximately aligns with a low-rank top eigenspace of the training loss\nHessian, referred to as the dominant subspace. Given this alignment, this paper\nexplores whether neural networks can be trained within the dominant subspace,\nwhich, if feasible, could lead to more efficient training methods. Our primary\nobservation is that when the SGD update is projected onto the dominant\nsubspace, the training loss does not decrease further. This suggests that the\nobserved alignment between the gradient and the dominant subspace is spurious.\nSurprisingly, projecting out the dominant subspace proves to be just as\neffective as the original update, despite removing the majority of the original\nupdate component. We observe similar behavior across practical setups,\nincluding the large learning rate regime (also known as Edge of Stability),\nSharpness-Aware Minimization, momentum, and adaptive optimizers. 
We discuss the\nmain causes and implications of this spurious alignment, shedding light on the\ndynamics of neural network training.\n","authors":["Minhak Song","Kwangjun Ahn","Chulhee Yun"],"pdf_url":"https://arxiv.org/pdf/2405.16002v2.pdf","comment":"Published at ICLR 2025"},{"id":"http://arxiv.org/abs/2401.15262v2","updated":"2025-03-03T00:04:46Z","published":"2024-01-27T01:16:33Z","title":"Asymptotic Behavior of Adversarial Training Estimator under\n $\\ell_\\infty$-Perturbation","summary":" Adversarial training has been proposed to protect machine learning models\nagainst adversarial attacks. This paper focuses on adversarial training under\n$\\ell_\\infty$-perturbation, which has recently attracted much research\nattention. The asymptotic behavior of the adversarial training estimator is\ninvestigated in the generalized linear model. The results imply that the\nasymptotic distribution of the adversarial training estimator under\n$\\ell_\\infty$-perturbation could put a positive probability mass at $0$ when\nthe true parameter is $0$, providing a theoretical guarantee of the associated\nsparsity-recovery ability. Alternatively, a two-step procedure is proposed --\nadaptive adversarial training, which could further improve the performance of\nadversarial training under $\\ell_\\infty$-perturbation. Specifically, the\nproposed procedure could achieve asymptotic variable-selection consistency and\nunbiasedness. 
Numerical experiments are conducted to show the sparsity-recovery\nability of adversarial training under $\\ell_\\infty$-perturbation and to compare\nthe empirical performance between classic adversarial training and adaptive\nadversarial training.\n","authors":["Yiling Xie","Xiaoming Huo"],"pdf_url":"https://arxiv.org/pdf/2401.15262v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2503.01980v1","updated":"2025-03-03T19:01:17Z","published":"2025-03-03T19:01:17Z","title":"Recurrence-Enhanced Vision-and-Language Transformers for Robust\n Multimodal Document Retrieval","summary":" Cross-modal retrieval is gaining increasing efficacy and interest from the\nresearch community, thanks to large-scale training, novel architectural and\nlearning designs, and its application in LLMs and multimodal LLMs. In this\npaper, we move a step forward and design an approach that allows for multimodal\nqueries, composed of both an image and a text, and can search within\ncollections of multimodal documents, where images and text are interleaved. Our\nmodel, ReT, employs multi-level representations extracted from different layers\nof both visual and textual backbones, both at the query and document side. To\nallow for multi-level and cross-modal understanding and feature extraction, ReT\nemploys a novel Transformer-based recurrent cell that integrates both textual\nand visual features at different layers, and leverages sigmoidal gates inspired\nby the classical design of LSTMs. Extensive experiments on M2KR and M-BEIR\nbenchmarks show that ReT achieves state-of-the-art performance across diverse\nsettings. 
Our source code and trained models are publicly available at\nhttps://github.com/aimagelab/ReT.\n","authors":["Davide Caffagni","Sara Sarto","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2503.01980v1.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2409.18459v2","updated":"2025-03-03T15:04:18Z","published":"2024-09-27T05:43:22Z","title":"FoodMLLM-JP: Leveraging Multimodal Large Language Models for Japanese\n Recipe Generation","summary":" Research on food image understanding using recipe data has been a\nlong-standing focus due to the diversity and complexity of the data. Moreover,\nfood is inextricably linked to people's lives, making it a vital research area\nfor practical applications such as dietary management. Recent advancements in\nMultimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities, not only in their vast knowledge but also in their ability to\nhandle languages naturally. While English is predominantly used, they can also\nsupport multiple languages including Japanese. This suggests that MLLMs are\nexpected to significantly improve performance in food image understanding\ntasks. We fine-tuned open MLLMs LLaVA-1.5 and Phi-3 Vision on a Japanese recipe\ndataset and benchmarked their performance against the closed model GPT-4o. We\nthen evaluated the content of generated recipes, including ingredients and\ncooking procedures, using 5,000 evaluation samples that comprehensively cover\nJapanese food culture. Our evaluation demonstrates that the open models trained\non recipe data outperform GPT-4o, the current state-of-the-art model, in\ningredient generation. Our model achieved F1 score of 0.531, surpassing\nGPT-4o's F1 score of 0.481, indicating a higher level of accuracy. 
Furthermore,\nour model exhibited comparable performance to GPT-4o in generating cooking\nprocedure text.\n","authors":["Yuki Imajuku","Yoko Yamakata","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2409.18459v2.pdf","comment":"15 pages, 5 figures. We found errors in the calculation of evaluation\n metrics, which were corrected in this version with\n $\\color{blue}{\\text{modifications highlighted in blue}}$. Please also see the\n Appendix"},{"id":"http://arxiv.org/abs/2503.01415v1","updated":"2025-03-03T11:10:37Z","published":"2025-03-03T11:10:37Z","title":"Improving the Efficiency of VVC using Partitioning of Reference Frames","summary":" In response to the growing demand for high-quality videos, Versatile Video\nCoding (VVC) was released in 2020, building on the hybrid coding architecture\nof its predecessor, HEVC, achieving about 50% bitrate reduction for the same\nvisual quality. It introduces more flexible block partitioning, enhancing\ncompression efficiency at the cost of increased encoding complexity. To make\nefficient use of VVC in practical applications, optimization is essential.\nVVenC, an optimized open-source VVC encoder, introduces multiple presets to\naddress the trade-off between compression efficiency and encoder complexity.\nAlthough an optimized set of encoding tools has been selected for each preset,\nthe rate-distortion (RD) search space in the encoder presets still poses a\nchallenge for efficient encoder implementations. In this paper, we propose\nEarly Termination using Reference Frames (ETRF), which improves the trade-off\nbetween encoding efficiency and time complexity and positions itself as a new\npreset between medium and fast presets. The CTU partitioning map of the\nreference frames in lower temporal layers is employed to accelerate the\nencoding of frames in higher temporal layers. The results show a reduction in\nthe encoding time of around 21% compared to the medium preset. 
Specifically,\nfor videos with high spatial and temporal complexities, which typically require\nlonger encoding times, the proposed method achieves a better trade-off between\nbitrate savings and encoding time compared to the fast preset.\n","authors":["Kamran Qureshi","Hadi Amirpour","Christian Timmerer"],"pdf_url":"https://arxiv.org/pdf/2503.01415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01404v1","updated":"2025-03-03T10:58:50Z","published":"2025-03-03T10:58:50Z","title":"Multi-resolution Encoding for HTTP Adaptive Streaming using VVenC","summary":" HTTP Adaptive Streaming (HAS) is a widely adopted method for delivering video\ncontent over the Internet, requiring each video to be encoded at multiple\nbitrates and resolution pairs, known as representations, to adapt to various\nnetwork conditions and device capabilities. This multi-bitrate encoding\nintroduces significant challenges due to the computational and time-intensive\nnature of encoding multiple representations. Conventional approaches often\nencode these videos independently without leveraging similarities between\ndifferent representations of the same input video. This paper proposes an\naccelerated multi-resolution encoding strategy that utilizes representations of\nlower resolutions as references to speed up the encoding of higher resolutions\nwhen using Versatile Video Coding (VVC); specifically in VVenC, an optimized\nopen-source software implementation. For multi-resolution encoding, a\nmid-bitrate representation serves as the reference, allowing interpolated\nencoded partition data to efficiently guide the partitioning process in higher\nresolutions. The proposed approach uses shared encoding information to reduce\nredundant calculations, optimizing partitioning decisions. 
Experimental results\ndemonstrate that the proposed technique achieves a reduction of up to 17%\ncompared to medium preset in encoding time across videos of varying\ncomplexities with minimal BDBR/BDT of 0.12 compared to the fast preset.\n","authors":["Kamran Qureshi","Hadi Amirpour","Christian Timmerer"],"pdf_url":"https://arxiv.org/pdf/2503.01404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01396v1","updated":"2025-03-03T10:52:34Z","published":"2025-03-03T10:52:34Z","title":"CorrNetDroid: Android Malware Detector leveraging a Correlation-based\n Feature Selection for Network Traffic features","summary":" Copious mobile operating systems exist in the market, but Android remains the\nuser's choice. Meanwhile, its growing popularity has also attracted malware\ndevelopers. Researchers have proposed various static solutions for Android\nmalware detection. However, stealthier malware evade static analysis. This\nraises the need for a robust Android malware detection system capable of\ndealing with advanced threats and overcoming the shortcomings of static\nanalysis.\n Hence, this work proposes a dynamic analysis-based Android malware detection\nsystem, CorrNetDroid, that works over network traffic flows. Many traffic\nfeatures exhibit overlapping ranges in normal and malware datasets. Therefore,\nwe first rank the features using two statistical measures, crRelevance and\nNormalized Mean Residue Similarity (NMRS), to assess feature-class and\nfeature-feature correlations. Thereafter, we introduce a novel\ncorrelation-based feature selection algorithm that applies NMRS on crRelevance\nrankings to identify the optimal feature subset for Android malware detection.\n Experimental results highlight that our model effectively reduces the feature\nset while detecting Android malware with 99.50 percent accuracy when\nconsidering only two network traffic features. 
Furthermore, our experiments\ndemonstrate that the NMRS-based algorithm on crRelevance rankings outperforms\nstatistical tests such as chi-square, ANOVA, Mann-Whitney U test, and\nKruskal-Wallis test. In addition, our model surpasses various state-of-the-art\nAndroid malware detection techniques in terms of detection accuracy.\n","authors":["Yash Sharma","Anshul Arora"],"pdf_url":"https://arxiv.org/pdf/2503.01396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01370v1","updated":"2025-03-03T10:07:19Z","published":"2025-03-03T10:07:19Z","title":"Kiss3DGen: Repurposing Image Diffusion Models for 3D Asset Generation","summary":" Diffusion models have achieved great success in generating 2D images.\nHowever, the quality and generalizability of 3D content generation remain\nlimited. State-of-the-art methods often require large-scale 3D assets for\ntraining, which are challenging to collect. In this work, we introduce\nKiss3DGen (Keep It Simple and Straightforward in 3D Generation), an efficient\nframework for generating, editing, and enhancing 3D objects by repurposing a\nwell-trained 2D image diffusion model for 3D generation. Specifically, we\nfine-tune a diffusion model to generate ''3D Bundle Image'', a tiled\nrepresentation composed of multi-view images and their corresponding normal\nmaps. The normal maps are then used to reconstruct a 3D mesh, and the\nmulti-view images provide texture mapping, resulting in a complete 3D model.\nThis simple method effectively transforms the 3D generation problem into a 2D\nimage generation task, maximizing the utilization of knowledge in pretrained\ndiffusion models. Furthermore, we demonstrate that our Kiss3DGen model is\ncompatible with various diffusion model techniques, enabling advanced features\nsuch as 3D editing, mesh and texture enhancement, etc. 
Through extensive\nexperiments, we demonstrate the effectiveness of our approach, showcasing its\nability to produce high-quality 3D models efficiently.\n","authors":["Jiantao Lin","Xin Yang","Meixi Chen","Yingjie Xu","Dongyu Yan","Leyi Wu","Xinli Xu","Lie XU","Shunsi Zhang","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2503.01370v1.pdf","comment":"The first three authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2503.01362v1","updated":"2025-03-03T09:55:54Z","published":"2025-03-03T09:55:54Z","title":"Streaming Piano Transcription Based on Consistent Onset and Offset\n Decoding with Sustain Pedal Detection","summary":" This paper describes a streaming audio-to-MIDI piano transcription approach\nthat aims to sequentially translate a music signal into a sequence of note\nonset and offset events. The sequence-to-sequence nature of this task may call\nfor the computationally-intensive transformer model for better performance,\nwhich has recently been used for offline transcription benchmarks and could be\nextended for streaming transcription with causal attention mechanisms. We\nassume that the performance limitation of this naive approach lies in the\ndecoder. Although time-frequency features useful for onset detection are\nconsiderably different from those for offset detection, the single decoder is\ntrained to output a mixed sequence of onset and offset events without guarantee\nof the correspondence between the onset and offset events of the same note. To\novercome this limitation, we propose a streaming encoder-decoder model that\nuses a convolutional encoder aggregating local acoustic features, followed by\nan autoregressive Transformer decoder detecting a variable number of onset\nevents and another decoder detecting the offset events for the active pitches\nwith validation of the sustain pedal at each time frame. 
Experiments using the\nMAESTRO dataset showed that the proposed streaming method performed comparably\nwith or even better than the state-of-the-art offline methods while\nsignificantly reducing the computational cost.\n","authors":["Weixing Wei","Jiahao Zhao","Yulun Wu","Kazuyoshi Yoshii"],"pdf_url":"https://arxiv.org/pdf/2503.01362v1.pdf","comment":"Accepted to ISMIR 2024"},{"id":"http://arxiv.org/abs/2503.01175v1","updated":"2025-03-03T04:47:39Z","published":"2025-03-03T04:47:39Z","title":"HOP: Heterogeneous Topology-based Multimodal Entanglement for Co-Speech\n Gesture Generation","summary":" Co-speech gestures are crucial non-verbal cues that enhance speech clarity\nand expressiveness in human communication, which have attracted increasing\nattention in multimodal research. While the existing methods have made strides\nin gesture accuracy, challenges remain in generating diverse and coherent\ngestures, as most approaches assume independence among multimodal inputs and\nlack explicit modeling of their interactions. In this work, we propose a novel\nmultimodal learning method named HOP for co-speech gesture generation that\ncaptures the heterogeneous entanglement between gesture motion, audio rhythm,\nand text semantics, enabling the generation of coordinated gestures. By\nleveraging spatiotemporal graph modeling, we achieve the alignment of audio and\naction. Moreover, to enhance modality coherence, we build the audio-text\nsemantic representation based on a reprogramming module, which is beneficial\nfor cross-modality adaptation. Our approach enables the trimodal system to\nlearn each other's features and represent them in the form of topological\nentanglement. Extensive experiments demonstrate that HOP achieves\nstate-of-the-art performance, offering more natural and expressive co-speech\ngesture generation. 
More information, codes, and demos are available here:\nhttps://star-uu-wang.github.io/HOP/\n","authors":["Hongye Cheng","Tianyu Wang","Guangsi Shi","Zexing Zhao","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2503.01175v1.pdf","comment":"Accepted by CVPR 2025. See https://star-uu-wang.github.io/HOP/"}]},"2025-03-02T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.07887v4","updated":"2025-03-02T23:48:32Z","published":"2023-10-11T20:48:20Z","title":"Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging\n Noise","summary":" Accurate analysis of microscopy images is hindered by the presence of noise.\nThis noise is usually signal-dependent and often additionally correlated along\nrows or columns of pixels. Current self- and unsupervised denoisers can address\nsignal-dependent noise, but none can reliably remove noise that is also row- or\ncolumn-correlated. Here, we present the first fully unsupervised deep\nlearning-based denoiser capable of handling imaging noise that is\nrow-correlated as well as signal-dependent. Our approach uses a Variational\nAutoencoder (VAE) with a specially designed autoregressive decoder. This\ndecoder is capable of modeling row-correlated and signal-dependent noise but is\nincapable of independently modeling underlying clean signal. The VAE therefore\nproduces latent variables containing only clean signal information, and these\nare mapped back into image space using a proposed second decoder network. Our\nmethod does not require a pre-trained noise model and can be trained from\nscratch using unpaired noisy data. 
We benchmark our approach on microscopy\ndatatsets from a range of imaging modalities and sensor types, each with row-\nor column-correlated, signal-dependent noise, and show that it outperforms\nexisting self- and unsupervised denoisers.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.07887v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15998v2","updated":"2025-03-02T23:41:37Z","published":"2024-08-28T17:59:31Z","title":"Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of\n Encoders","summary":" The ability to accurately interpret complex visual information is a crucial\ntopic of multimodal large language models (MLLMs). Recent work indicates that\nenhanced visual perception significantly reduces hallucinations and improves\nperformance on resolution-sensitive tasks, such as optical character\nrecognition and document analysis. A number of recent MLLMs achieve this goal\nusing a mixture of vision encoders. Despite their success, there is a lack of\nsystematic comparisons and detailed ablation studies addressing critical\naspects, such as expert selection and the integration of multiple vision\nexperts. This study provides an extensive exploration of the design space for\nMLLMs using a mixture of vision encoders and resolutions. Our findings reveal\nseveral underlying principles common to various existing strategies, leading to\na streamlined yet effective design approach. We discover that simply\nconcatenating visual tokens from a set of complementary vision encoders is as\neffective as more complex mixing architectures or strategies. We additionally\nintroduce Pre-Alignment to bridge the gap between vision-focused encoders and\nlanguage tokens, enhancing model coherence. 
The resulting family of MLLMs,\nEagle, surpasses other leading open-source models on major MLLM benchmarks.\n","authors":["Min Shi","Fuxiao Liu","Shihao Wang","Shijia Liao","Subhashree Radhakrishnan","Yilin Zhao","De-An Huang","Hongxu Yin","Karan Sapra","Yaser Yacoob","Humphrey Shi","Bryan Catanzaro","Andrew Tao","Jan Kautz","Zhiding Yu","Guilin Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15998v2.pdf","comment":"Github: https://github.com/NVlabs/Eagle, HuggingFace:\n https://huggingface.co/NVEagle"},{"id":"http://arxiv.org/abs/2411.01106v2","updated":"2025-03-02T22:41:37Z","published":"2024-11-02T02:09:01Z","title":"SV-RAG: LoRA-Contextualizing Adaptation of MLLMs for Long Document\n Understanding","summary":" Multimodal large language models (MLLMs) have recently shown great progress\nin text-rich image understanding, yet they still struggle with complex,\nmulti-page visually-rich documents. Traditional methods using document parsers\nfor retrieval-augmented generation suffer from performance and efficiency\nlimitations, while directly presenting all pages to MLLMs leads to\ninefficiencies, especially with lengthy ones. In this work, we present a novel\nframework named **S**elf-**V**isual **R**etrieval-**A**ugmented **G**eneration\n(SV-RAG), which can broaden horizons of any MLLM to support long-document\nunderstanding. We demonstrate that **MLLMs themselves can be an effective\nmultimodal retriever** to fetch relevant pages and then answer user questions\nbased on these pages. SV-RAG is implemented with two specific MLLM adapters,\none for evidence page retrieval and the other for question answering. Empirical\nresults show state-of-the-art performance on public benchmarks, demonstrating\nthe effectiveness of SV-RAG.\n","authors":["Jian Chen","Ruiyi Zhang","Yufan Zhou","Tong Yu","Franck Dernoncourt","Jiuxiang Gu","Ryan A. 
Rossi","Changyou Chen","Tong Sun"],"pdf_url":"https://arxiv.org/pdf/2411.01106v2.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2502.00156v2","updated":"2025-03-02T20:53:26Z","published":"2025-01-31T20:47:06Z","title":"ALBAR: Adversarial Learning approach to mitigate Biases in Action\n Recognition","summary":" Bias in machine learning models can lead to unfair decision making, and while\nit has been well-studied in the image and text domains, it remains\nunderexplored in action recognition. Action recognition models often suffer\nfrom background bias (i.e., inferring actions based on background cues) and\nforeground bias (i.e., relying on subject appearance), which can be detrimental\nto real-life applications such as autonomous vehicles or assisted living\nmonitoring. While prior approaches have mainly focused on mitigating background\nbias using specialized augmentations, we thoroughly study both foreground and\nbackground bias. We propose ALBAR, a novel adversarial training method that\nmitigates foreground and background biases without requiring specialized\nknowledge of the bias attributes. Our framework applies an adversarial\ncross-entropy loss to the sampled static clip (where all the frames are the\nsame) and aims to make its class probabilities uniform using a proposed entropy\nmaximization loss. Additionally, we introduce a gradient penalty loss for\nregularization against the debiasing process. We evaluate our method on\nestablished background and foreground bias protocols, setting a new\nstate-of-the-art and strongly improving combined debiasing performance by over\n12% absolute on HMDB51. Furthermore, we identify an issue of background leakage\nin the existing UCF101 protocol for bias evaluation which provides a shortcut\nto predict actions and does not provide an accurate measure of the debiasing\ncapability of a model. 
We address this issue by proposing more fine-grained\nsegmentation boundaries for the actor, where our method also outperforms\nexisting approaches. Project Page:\nhttps://joefioresi718.github.io/ALBAR_webpage/\n","authors":["Joseph Fioresi","Ishan Rajendrakumar Dave","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2502.00156v2.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2312.15289v3","updated":"2025-03-02T18:36:56Z","published":"2023-12-23T16:10:53Z","title":"Fréchet Wavelet Distance: A Domain-Agnostic Metric for Image\n Generation","summary":" Modern metrics for generative learning like Fr\\'echet Inception Distance\n(FID) and DINOv2-Fr\\'echet Distance (FD-DINOv2) demonstrate impressive\nperformance. However, they suffer from various shortcomings, like a bias\ntowards specific generators and datasets. To address this problem, we propose\nthe Fr\\'echet Wavelet Distance (FWD) as a domain-agnostic metric based on the\nWavelet Packet Transform ($W_p$). FWD provides a sight across a broad spectrum\nof frequencies in images with a high resolution, preserving both spatial and\ntextural aspects. Specifically, we use $W_p$ to project generated and real\nimages to the packet coefficient space. We then compute the Fr\\'echet distance\nwith the resultant coefficients to evaluate the quality of a generator. This\nmetric is general-purpose and dataset-domain agnostic, as it does not rely on\nany pre-trained network, while being more interpretable due to its ability to\ncompute Fr\\'echet distance per packet, enhancing transparency. 
We conclude with\nan extensive evaluation of a wide variety of generators across various datasets\nthat the proposed FWD can generalize and improve robustness to domain shifts\nand various corruptions compared to other metrics.\n","authors":["Lokesh Veeramacheneni","Moritz Wolter","Hildegard Kuehne","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2312.15289v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10509v2","updated":"2025-03-02T18:17:14Z","published":"2024-11-15T15:39:04Z","title":"TESGNN: Temporal Equivariant Scene Graph Neural Networks for Efficient\n and Robust Multi-View 3D Scene Understanding","summary":" Scene graphs have proven to be highly effective for various scene\nunderstanding tasks due to their compact and explicit representation of\nrelational information. However, current methods often overlook the critical\nimportance of preserving symmetry when generating scene graphs from 3D point\nclouds, which can lead to reduced accuracy and robustness, particularly when\ndealing with noisy, multi-view data. Furthermore, a major limitation of prior\napproaches is the lack of temporal modeling to capture time-dependent\nrelationships among dynamically evolving entities in a scene. To address these\nchallenges, we propose Temporal Equivariant Scene Graph Neural Network\n(TESGNN), consisting of two key components: (1) an Equivariant Scene Graph\nNeural Network (ESGNN), which extracts information from 3D point clouds to\ngenerate scene graph while preserving crucial symmetry properties, and (2) a\nTemporal Graph Matching Network, which fuses scene graphs generated by ESGNN\nacross multiple time sequences into a unified global representation using an\napproximate graph-matching algorithm. Our combined architecture TESGNN\noutperforms current state-of-the-art methods in scene graph generation,\nachieving higher accuracy and faster training convergence. 
Moreover, we show\nthat leveraging the symmetry-preserving property produces a more stable and\naccurate global scene representation compared to existing approaches. Last but\nnot least, it is computationally efficient and easily implementable using\nexisting frameworks, making it well-suited for real-time applications in\nrobotics and computer vision. This approach paves the way for more robust and\nscalable solutions to complex multi-view scene understanding challenges. Our\nsource code is publicly available at: https://github.com/HySonLab/TESGraph\n","authors":["Quang P. M. Pham","Khoi T. N. Nguyen","Lan C. Ngo","Truong Do","Dezhen Song","Truong-Son Hy"],"pdf_url":"https://arxiv.org/pdf/2411.10509v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.00609"},{"id":"http://arxiv.org/abs/2411.02372v2","updated":"2025-03-02T17:34:53Z","published":"2024-11-04T18:40:46Z","title":"Learning General-Purpose Biomedical Volume Representations using\n Randomized Synthesis","summary":" Current volumetric biomedical foundation models struggle to generalize as\npublic 3D datasets are small and do not cover the broad diversity of medical\nprocedures, conditions, anatomical regions, and imaging protocols. We address\nthis by creating a representation learning method that instead anticipates\nstrong domain shifts at training time itself. We first propose a data engine\nthat synthesizes highly variable training samples that would enable\ngeneralization to new biomedical contexts. To then train a single 3D network\nfor any voxel-level task, we develop a contrastive learning method that\npretrains the network to be stable against nuisance imaging variation simulated\nby the data engine, a key inductive bias for generalization. This network's\nfeatures can be used as robust representations of input images for downstream\ntasks and its weights provide a strong, dataset-agnostic initialization for\nfinetuning on new datasets. 
As a result, we set new standards across both\nmultimodality registration and few-shot segmentation, a first for any 3D\nbiomedical vision model, all without (pre-)training on any existing dataset of\nreal images.\n","authors":["Neel Dey","Benjamin Billot","Hallee E. Wong","Clinton J. Wang","Mengwei Ren","P. Ellen Grant","Adrian V. Dalca","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2411.02372v2.pdf","comment":"ICLR 2025: International Conference on Learning Representations. Code\n and model weights available at https://github.com/neel-dey/anatomix.\n Keywords: synthetic data, representation learning, medical image analysis,\n image registration, image segmentation"},{"id":"http://arxiv.org/abs/2409.14876v3","updated":"2025-03-02T17:27:04Z","published":"2024-09-23T10:17:13Z","title":"Tri-Clustering: A Multi-views Tri-level Information Fusion Context\n Clustering Framework for Localization and Classification in Mammography","summary":" Breast cancer is a significant global health issue, and the diagnosis of\nbreast imaging has always been challenging. Mammography images typically have\nextremely high resolution, with lesions occupying only a very small area.\nDown-sampling in neural networks can easily lead to the loss of\nmicrocalcifications or subtle structures, making it difficult for traditional\nneural network architectures to address these issues. To tackle these\nchallenges, we propose a Context Clustering Network with triple information\nfusion. Firstly, compared to CNNs or transformers, we find that Context\nclustering methods (1) are more computationally efficient and (2) can more\neasily associate structural or pathological features, making them suitable for\nthe clinical tasks of mammography. Secondly, we propose a triple information\nfusion mechanism that integrates global information, feature-based local\ninformation, and patch-based local information. 
The proposed approach is\nrigorously evaluated on two public datasets, Vindr-Mammo and CBIS-DDSM, using\nfive independent splits to ensure statistical robustness. Our method achieves\nan AUC of 0.828 on Vindr-Mammo and 0.805 on CBIS-DDSM, outperforming the next\nbest method by 3.1% and 2.4%, respectively. These improvements are\nstatistically significant (p<0.05), underscoring the benefits of Context\nClustering Network with triple information fusion. Overall, our Context\nClustering framework demonstrates strong potential as a scalable and\ncost-effective solution for large-scale mammography screening, enabling more\nefficient and accurate breast cancer detection. Access to our method is\navailable at https://github.com/Sohyu1/Mammo_Clustering.\n","authors":["Shilong Yang","Chulong Zhang","Qi Zang","Juan Yu","Liang Zeng","Xiao Luo","Yexuan Xing","Xin Pan","Qi Li","Xiaokun Liang","Yaoqin Xie"],"pdf_url":"https://arxiv.org/pdf/2409.14876v3.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.04810v2","updated":"2025-03-02T17:18:04Z","published":"2024-10-07T07:45:18Z","title":"FedBiP: Heterogeneous One-Shot Federated Learning with Personalized\n Latent Diffusion Models","summary":" One-Shot Federated Learning (OSFL), a special decentralized machine learning\nparadigm, has recently gained significant attention. OSFL requires only a\nsingle round of client data or model upload, which reduces communication costs\nand mitigates privacy threats compared to traditional FL. Despite these\npromising prospects, existing methods face challenges due to client data\nheterogeneity and limited data quantity when applied to real-world OSFL\nsystems. Recently, Latent Diffusion Models (LDM) have shown remarkable\nadvancements in synthesizing high-quality images through pretraining on\nlarge-scale datasets, thereby presenting a potential solution to overcome these\nissues. 
However, directly applying pretrained LDM to heterogeneous OSFL results\nin significant distribution shifts in synthetic data, leading to performance\ndegradation in classification models trained on such data. This issue is\nparticularly pronounced in rare domains, such as medical imaging, which are\nunderrepresented in LDM's pretraining data. To address this challenge, we\npropose Federated Bi-Level Personalization (FedBiP), which personalizes the\npretrained LDM at both instance-level and concept-level. Hereby, FedBiP\nsynthesizes images following the client's local data distribution without\ncompromising the privacy regulations. FedBiP is also the first approach to\nsimultaneously address feature space heterogeneity and client data scarcity in\nOSFL. Our method is validated through extensive experiments on three OSFL\nbenchmarks with feature space heterogeneity, as well as on challenging medical\nand satellite image datasets with label heterogeneity. The results demonstrate\nthe effectiveness of FedBiP, which substantially outperforms other OSFL\nmethods.\n","authors":["Haokun Chen","Hang Li","Yao Zhang","Jinhe Bi","Gengyuan Zhang","Yueqi Zhang","Philip Torr","Jindong Gu","Denis Krompass","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2410.04810v2.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2308.09036v2","updated":"2025-03-02T17:15:30Z","published":"2023-08-17T15:17:49Z","title":"Synthesizing Physically Plausible Human Motions in 3D Scenes","summary":" We present a physics-based character control framework for synthesizing\nhuman-scene interactions. Recent advances adopt physics simulation to mitigate\nartifacts produced by data-driven kinematic approaches. However, existing\nphysics-based methods mainly focus on single-object environments, resulting in\nlimited applicability in realistic 3D scenes with multi-objects. 
To address\nsuch challenges, we propose a framework that enables physically simulated\ncharacters to perform long-term interaction tasks in diverse, cluttered, and\nunseen 3D scenes. The key idea is to decouple human-scene interactions into two\nfundamental processes, Interacting and Navigating, which motivates us to\nconstruct two reusable Controllers, namely InterCon and NavCon. Specifically,\nInterCon uses two complementary policies to enable characters to enter or leave\nthe interacting state with a particular object (e.g., sitting on a chair or\ngetting up). To realize navigation in cluttered environments, we introduce\nNavCon, where a trajectory following policy enables characters to track\npre-planned collision-free paths. Benefiting from the divide and conquer\nstrategy, we can train all policies in simple environments and directly apply\nthem in complex multi-object scenes through coordination from a rule-based\nscheduler. Video and code are available at\nhttps://github.com/liangpan99/InterScene.\n","authors":["Liang Pan","Jingbo Wang","Buzhen Huang","Junyu Zhang","Haofan Wang","Xu Tang","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09036v2.pdf","comment":"3DV 2024 version"},{"id":"http://arxiv.org/abs/2410.15744v2","updated":"2025-03-02T16:58:17Z","published":"2024-10-21T08:01:58Z","title":"Unleashing the Potential of Vision-Language Pre-Training for 3D\n Zero-Shot Lesion Segmentation via Mask-Attribute Alignment","summary":" Recent advancements in medical vision-language pre-training models have\ndriven significant progress in zero-shot disease recognition. However,\ntransferring image-level knowledge to pixel-level tasks, such as lesion\nsegmentation in 3D CT scans, remains a critical challenge. Due to the\ncomplexity and variability of pathological visual characteristics, existing\nmethods struggle to align fine-grained lesion features not encountered during\ntraining with disease-related textual representations. 
In this paper, we\npresent Malenia, a novel multi-scale lesion-level mask-attribute alignment\nframework, specifically designed for 3D zero-shot lesion segmentation. Malenia\nimproves the compatibility between mask representations and their associated\nelemental attributes, explicitly linking the visual features of unseen lesions\nwith the extensible knowledge learned from previously seen ones. Furthermore,\nwe design a Cross-Modal Knowledge Injection module to enhance both visual and\ntextual features with mutually beneficial information, effectively guiding the\ngeneration of segmentation results. Comprehensive experiments across three\ndatasets and 12 lesion categories validate the superior performance of Malenia.\n","authors":["Yankai Jiang","Wenhui Lei","Xiaofan Zhang","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.15744v2.pdf","comment":"Accepted as ICLR 2025 conference paper"},{"id":"http://arxiv.org/abs/2403.18035v4","updated":"2025-03-02T16:41:49Z","published":"2024-03-26T18:40:36Z","title":"Bidirectional Consistency Models","summary":" Diffusion models (DMs) are capable of generating remarkably high-quality\nsamples by iteratively denoising a random vector, a process that corresponds to\nmoving along the probability flow ordinary differential equation (PF ODE).\nInterestingly, DMs can also invert an input image to noise by moving backward\nalong the PF ODE, a key operation for downstream tasks such as interpolation\nand image editing. However, the iterative nature of this process restricts its\nspeed, hindering its broader application. Recently, Consistency Models (CMs)\nhave emerged to address this challenge by approximating the integral of the PF\nODE, largely reducing the number of iterations. Yet, the absence of an explicit\nODE solver complicates the inversion process. 
To resolve this, we introduce\nBidirectional Consistency Model (BCM), which learns a single neural network\nthat enables both forward and backward traversal along the PF ODE, efficiently\nunifying generation and inversion tasks within one framework. We can train BCM\nfrom scratch or tune it using a pretrained consistency model, which reduces the\ntraining cost and increases scalability. We demonstrate that BCM enables\none-step generation and inversion while also allowing the use of additional\nsteps to enhance generation quality or reduce reconstruction error. We further\nshowcase BCM's capability in downstream tasks, such as interpolation and\ninpainting. Our code and weights are available at\nhttps://github.com/Mosasaur5526/BCM-iCT-torch.\n","authors":["Liangchen Li","Jiajun He"],"pdf_url":"https://arxiv.org/pdf/2403.18035v4.pdf","comment":"39 pages, 27 figures; a shorter version of this paper was acceppted\n at the ICML 2024 Workshop on Structured Probabilistic Inference & Generative\n Modeling"},{"id":"http://arxiv.org/abs/2408.11915v2","updated":"2025-03-02T15:55:14Z","published":"2024-08-21T18:06:15Z","title":"Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event\n Condition For Foley Sound","summary":" Foley sound synthesis is crucial for multimedia production, enhancing user\nexperience by synchronizing audio and video both temporally and semantically.\nRecent studies on automating this labor-intensive process through\nvideo-to-sound generation face significant challenges. Systems lacking explicit\ntemporal features suffer from poor alignment and controllability, while\ntimestamp-based models require costly and subjective human annotation. We\npropose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as an\nintuitive condition with semantic timbre prompts (audio or text). RMS, a\nframe-level intensity envelope closely related to audio semantics, acts as a\ntemporal event feature to guide audio generation from video. 
The\nannotation-free self-supervised learning framework consists of two stages,\nVideo2RMS and RMS2Sound, incorporating novel ideas including RMS discretization\nand RMS-ControlNet with a pretrained text-to-audio model. Our extensive\nevaluation shows that Video-Foley achieves state-of-the-art performance in\naudio-visual alignment and controllability for sound timing, intensity, timbre,\nand nuance. Source code, model weights and demos are available on our companion\nwebsite. (https://jnwnlee.github.io/video-foley-demo)\n","authors":["Junwon Lee","Jaekwon Im","Dabin Kim","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2408.11915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03895v2","updated":"2025-03-02T15:55:07Z","published":"2025-01-07T16:03:14Z","title":"LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One\n Vision Token","summary":" The advent of real-time large multimodal models (LMMs) like GPT-4o has\nsparked considerable interest in efficient LMMs. LMM frameworks typically\nencode visual inputs into vision tokens (continuous representations) and\nintegrate them and textual instructions into the context of large language\nmodels (LLMs), where large-scale parameters and numerous context tokens\n(predominantly vision tokens) result in substantial computational overhead.\nPrevious efforts towards efficient LMMs always focus on replacing the LLM\nbackbone with smaller models, while neglecting the crucial issue of token\nquantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal\nvision tokens. To achieve a high compression ratio of vision tokens while\npreserving visual information, we first analyze how LMMs understand vision\ntokens and find that most vision tokens only play a crucial role in the early\nlayers of LLM backbone, where they mainly fuse visual information into text\ntokens. 
Building on this finding, LLaVA-Mini introduces modality pre-fusion to\nfuse visual information into text tokens in advance, thereby facilitating the\nextreme compression of vision tokens fed to LLM backbone into one token.\nLLaVA-Mini is a unified large multimodal model that can support the\nunderstanding of images, high-resolution images, and videos in an efficient\nmanner. Experiments across 11 image-based and 7 video-based benchmarks\ndemonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token\ninstead of 576. Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by\n77%, deliver low-latency responses within 40 milliseconds, and process over\n10,000 frames of video on the GPU hardware with 24GB of memory.\n","authors":["Shaolei Zhang","Qingkai Fang","Zhe Yang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2501.03895v2.pdf","comment":"Accepted to ICLR 2025. Code: https://github.com/ictnlp/LLaVA-Mini\n Model: https://huggingface.co/ICTNLP/llava-mini-llama-3.1-8b"},{"id":"http://arxiv.org/abs/2501.18672v3","updated":"2025-03-02T15:43:39Z","published":"2025-01-30T18:51:54Z","title":"Drag Your Gaussian: Effective Drag-Based Editing with Score Distillation\n for 3D Gaussian Splatting","summary":" Recent advancements in 3D scene editing have been propelled by the rapid\ndevelopment of generative models. Existing methods typically utilize generative\nmodels to perform text-guided editing on 3D representations, such as 3D\nGaussian Splatting (3DGS). However, these methods are often limited to texture\nmodifications and fail when addressing geometric changes, such as editing a\ncharacter's head to turn around. Moreover, such methods lack accurate control\nover the spatial position of editing results, as language struggles to\nprecisely describe the extent of edits. To overcome these limitations, we\nintroduce DYG, an effective 3D drag-based editing method for 3D Gaussian\nSplatting. 
It enables users to conveniently specify the desired editing region\nand the desired dragging direction through the input of 3D masks and pairs of\ncontrol points, thereby enabling precise control over the extent of editing.\nDYG integrates the strengths of the implicit triplane representation to\nestablish the geometric scaffold of the editing results, effectively overcoming\nsuboptimal editing outcomes caused by the sparsity of 3DGS in the desired\nediting regions. Additionally, we incorporate a drag-based Latent Diffusion\nModel into our method through the proposed Drag-SDS loss function, enabling\nflexible, multi-view consistent, and fine-grained editing. Extensive\nexperiments demonstrate that DYG conducts effective drag-based editing guided\nby control point prompts, surpassing other baselines in terms of editing effect\nand quality, both qualitatively and quantitatively. Visit our project page at\nhttps://quyans.github.io/Drag-Your-Gaussian.\n","authors":["Yansong Qu","Dian Chen","Xinyang Li","Xiaofan Li","Shengchuan Zhang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2501.18672v3.pdf","comment":"Visit our project page at https://quyans.github.io/Drag-Your-Gaussian"},{"id":"http://arxiv.org/abs/2310.18709v4","updated":"2025-03-02T15:37:39Z","published":"2023-10-28T13:37:52Z","title":"Audio-Visual Instance Segmentation","summary":" In this paper, we propose a new multi-modal task, termed audio-visual\ninstance segmentation (AVIS), which aims to simultaneously identify, segment\nand track individual sounding object instances in audible videos. To facilitate\nthis research, we introduce a high-quality benchmark named AVISeg, containing\nover 90K instance masks from 26 semantic categories in 926 long videos.\nAdditionally, we propose a strong baseline model for this task. Our model first\nlocalizes sound source within each frame, and condenses object-specific\ncontexts into concise tokens. 
Then it builds long-range audio-visual\ndependencies between these tokens using window-based attention, and tracks\nsounding objects among the entire video sequences. Extensive experiments reveal\nthat our method performs best on AVISeg, surpassing the existing methods from\nrelated tasks. We further conduct the evaluation on several multi-modal large\nmodels. Unfortunately, they exhibits subpar performance on instance-level sound\nsource localization and temporal perception. We expect that AVIS will inspire\nthe community towards a more comprehensive multi-modal understanding. Dataset\nand code is available at https://github.com/ruohaoguo/avis.\n","authors":["Ruohao Guo","Xianghua Ying","Yaru Chen","Dantong Niu","Guangyao Li","Liao Qu","Yanyu Qi","Jinxing Zhou","Bowei Xing","Wenzhen Yue","Ji Shi","Qixun Wang","Peiliang Zhang","Buwen Liang"],"pdf_url":"https://arxiv.org/pdf/2310.18709v4.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2409.03114v2","updated":"2025-03-02T15:30:06Z","published":"2024-09-04T22:39:02Z","title":"Evaluating Low-Resource Lane Following Algorithms for\n Compute-Constrained Automated Vehicles","summary":" Reliable lane-following is essential for automated and assisted driving, yet\nexisting solutions often rely on models that require extensive computational\nresources, limiting their deployment in compute-constrained vehicles. We\nevaluate five low-resource lane-following algorithms designed for real-time\noperation on vehicles with limited computing resources. Performance was\nassessed through simulation and deployment on real drive-by-wire electric\nvehicles, with evaluation metrics including reliability, comfort, speed, and\nadaptability. 
The top-performing methods used unsupervised learning to detect\nand separate lane lines with processing time under 10 ms per frame,\noutperforming compute-intensive and poor generalizing deep learning approaches.\nThese approaches demonstrated robustness across lighting conditions, road\ntextures, and lane geometries. The findings highlight the potential for\nefficient lane detection approaches to enhance the accessibility and\nreliability of autonomous vehicle technologies. Reducing computing requirements\nenables lane keeping to be widely deployed in vehicles as part of lower-level\nautomation, including active safety systems.\n","authors":["Beñat Froemming-Aldanondo","Tatiana Rastoskueva","Michael Evans","Marcial Machado","Anna Vadella","Rickey Johnson","Luis Escamilla","Milan Jostes","Devson Butani","Ryan Kaddis","Chan-Jin Chung","Joshua Siegel"],"pdf_url":"https://arxiv.org/pdf/2409.03114v2.pdf","comment":"Supported by the National Science Foundation under Grants No. 2150292\n and 2150096"},{"id":"http://arxiv.org/abs/2410.03878v2","updated":"2025-03-02T15:22:12Z","published":"2024-10-04T19:22:20Z","title":"SPARTUN3D: Situated Spatial Understanding of 3D World in Large Language\n Models","summary":" Integrating the 3D world into large language models (3D-based LLMs) has been\na promising research direction for 3D scene understanding. However, current\n3D-based LLMs fall short in situated understanding due to two key limitations:\n1) existing 3D datasets are constructed from a global perspective of the 3D\nscenes and lack situated context. 2) the architectures of existing 3D-based\nLLMs lack explicit alignment between the spatial representations of 3D scenes\nand natural language, limiting their performance in tasks requiring precise\nspatial reasoning. We address these issues by introducing a scalable situated\n3D dataset, named Spartun3D, that incorporates various situated spatial\nreasoning tasks. 
Furthermore, we propose Spartun3D-LLM, built on an existing\n3D-based LLM but integrated with a novel situated spatial alignment module,\naiming to enhance the alignment between 3D visual representations and their\ncorresponding textual descriptions. Experimental results demonstrate that both\nour proposed dataset and alignment module significantly enhance the situated\nspatial understanding of 3D-based LLMs.\n","authors":["Yue Zhang","Zhiyang Xu","Ying Shen","Parisa Kordjamshidi","Lifu Huang"],"pdf_url":"https://arxiv.org/pdf/2410.03878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13426v2","updated":"2025-03-02T15:06:51Z","published":"2024-09-20T11:46:48Z","title":"HMD^2: Environment-aware Motion Generation from Single Egocentric\n Head-Mounted Device","summary":" This paper investigates the generation of realistic full-body human motion\nusing a single head-mounted device with an outward-facing color camera and the\nability to perform visual SLAM. To address the ambiguity of this setup, we\npresent HMD^2, a novel system that balances motion reconstruction and\ngeneration. From a reconstruction standpoint, it aims to maximally utilize the\ncamera streams to produce both analytical and learned features, including head\nmotion, SLAM point cloud, and image embeddings. On the generative front, HMD^2\nemploys a multi-modal conditional motion diffusion model with a Transformer\nbackbone to maintain temporal coherence of generated motions, and utilizes\nautoregressive inpainting to facilitate online motion inference with minimal\nlatency (0.17 seconds). We show that our system provides an effective and\nrobust solution that scales to a diverse dataset of over 200 hours of motion in\ncomplex indoor and outdoor environments.\n","authors":["Vladimir Guzov","Yifeng Jiang","Fangzhou Hong","Gerard Pons-Moll","Richard Newcombe","C. 
Karen Liu","Yuting Ye","Lingni Ma"],"pdf_url":"https://arxiv.org/pdf/2409.13426v2.pdf","comment":"International Conference on 3D Vision 2025 (3DV 2025)"},{"id":"http://arxiv.org/abs/2502.11858v3","updated":"2025-03-02T14:14:07Z","published":"2025-02-17T14:50:34Z","title":"Rethinking Audio-Visual Adversarial Vulnerability from Temporal and\n Modality Perspectives","summary":" While audio-visual learning equips models with a richer understanding of the\nreal world by leveraging multiple sensory modalities, this integration also\nintroduces new vulnerabilities to adversarial attacks.\n In this paper, we present a comprehensive study of the adversarial robustness\nof audio-visual models, considering both temporal and modality-specific\nvulnerabilities. We propose two powerful adversarial attacks: 1) a temporal\ninvariance attack that exploits the inherent temporal redundancy across\nconsecutive time segments and 2) a modality misalignment attack that introduces\nincongruence between the audio and visual modalities. These attacks are\ndesigned to thoroughly assess the robustness of audio-visual models against\ndiverse threats. Furthermore, to defend against such attacks, we introduce a\nnovel audio-visual adversarial training framework. This framework addresses key\nchallenges in vanilla adversarial training by incorporating efficient\nadversarial perturbation crafting tailored to multi-modal data and an\nadversarial curriculum strategy. 
Extensive experiments in the Kinetics-Sounds\ndataset demonstrate that our proposed temporal and modality-based attacks in\ndegrading model performance can achieve state-of-the-art performance, while our\nadversarial training defense largely improves the adversarial robustness as\nwell as the adversarial training efficiency.\n","authors":["Zeliang Zhang","Susan Liang","Daiki Shimada","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2502.11858v3.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2410.02094v3","updated":"2025-03-02T14:04:22Z","published":"2024-10-02T23:30:05Z","title":"Tracking objects that change in appearance with phase synchrony","summary":" Objects we encounter often change appearance as we interact with them.\nChanges in illumination (shadows), object pose, or the movement of non-rigid\nobjects can drastically alter available image features. How do biological\nvisual systems track objects as they change? One plausible mechanism involves\nattentional mechanisms for reasoning about the locations of objects\nindependently of their appearances -- a capability that prominent neuroscience\ntheories have associated with computing through neural synchrony. Here, we\ndescribe a novel deep learning circuit that can learn to precisely control\nattention to features separately from their location in the world through\nneural synchrony: the complex-valued recurrent neural network (CV-RNN). Next,\nwe compare object tracking in humans, the CV-RNN, and other deep neural\nnetworks (DNNs), using FeatureTracker: a large-scale challenge that asks\nobservers to track objects as their locations and appearances change in\nprecisely controlled ways. While humans effortlessly solved FeatureTracker,\nstate-of-the-art DNNs did not. 
In contrast, our CV-RNN behaved similarly to\nhumans on the challenge, providing a computational proof-of-concept for the\nrole of phase synchronization as a neural substrate for tracking\nappearance-morphing objects as they move about.\n","authors":["Sabine Muzellec","Drew Linsley","Alekh K. Ashok","Ennio Mingolla","Girik Malik","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2410.02094v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.19047v2","updated":"2025-03-02T13:52:23Z","published":"2025-02-26T11:01:43Z","title":"A Dual-Purpose Framework for Backdoor Defense and Backdoor Amplification\n in Diffusion Models","summary":" Diffusion models have emerged as state-of-the-art generative frameworks,\nexcelling in producing high-quality multi-modal samples. However, recent\nstudies have revealed their vulnerability to backdoor attacks, where backdoored\nmodels generate specific, undesirable outputs called backdoor target (e.g.,\nharmful images) when a pre-defined trigger is embedded to their inputs. In this\npaper, we propose PureDiffusion, a dual-purpose framework that simultaneously\nserves two contrasting roles: backdoor defense and backdoor attack\namplification. For defense, we introduce two novel loss functions to invert\nbackdoor triggers embedded in diffusion models. The first leverages\ntrigger-induced distribution shifts across multiple timesteps of the diffusion\nprocess, while the second exploits the denoising consistency effect when a\nbackdoor is activated. Once an accurate trigger inversion is achieved, we\ndevelop a backdoor detection method that analyzes both the inverted trigger and\nthe generated backdoor targets to identify backdoor attacks. In terms of attack\namplification with the role of an attacker, we describe how our trigger\ninversion algorithm can be used to reinforce the original trigger embedded in\nthe backdoored diffusion model. 
This significantly boosts attack performance\nwhile reducing the required backdoor training time. Experimental results\ndemonstrate that PureDiffusion achieves near-perfect detection accuracy,\noutperforming existing defenses by a large margin, particularly against complex\ntrigger patterns. Additionally, in an attack scenario, our attack amplification\napproach elevates the attack success rate (ASR) of existing backdoor attacks to\nnearly 100\\% while reducing training time by up to 20x.\n","authors":["Vu Tuan Truong","Long Bao Le"],"pdf_url":"https://arxiv.org/pdf/2502.19047v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15161v2","updated":"2025-03-02T13:49:21Z","published":"2024-04-23T16:01:33Z","title":"Test-Time Adaptation for Combating Missing Modalities in Egocentric\n Videos","summary":" Understanding videos that contain multiple modalities is crucial, especially\nin egocentric videos, where combining various sensory inputs significantly\nimproves tasks like action recognition and moment localization. However,\nreal-world applications often face challenges with incomplete modalities due to\nprivacy concerns, efficiency needs, or hardware issues. Current methods, while\neffective, often necessitate retraining the model entirely to handle missing\nmodalities, making them computationally intensive, particularly with large\ntraining datasets. In this study, we propose a novel approach to address this\nissue at test time without requiring retraining. We frame the problem as a\ntest-time adaptation task, where the model adjusts to the available unlabeled\ndata at test time. Our method, MiDl~(Mutual information with\nself-Distillation), encourages the model to be insensitive to the specific\nmodality source present during testing by minimizing the mutual information\nbetween the prediction and the available modality. Additionally, we incorporate\nself-distillation to maintain the model's original performance when both\nmodalities are available. 
MiDl represents the first self-supervised, online\nsolution for handling missing modalities exclusively at test time. Through\nexperiments with various pretrained models and datasets, MiDl demonstrates\nsubstantial performance improvement without the need for retraining.\n","authors":["Merey Ramazanova","Alejandro Pardo","Bernard Ghanem","Motasem Alfarra"],"pdf_url":"https://arxiv.org/pdf/2404.15161v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20209v2","updated":"2025-03-02T13:36:57Z","published":"2025-02-27T15:50:21Z","title":"DIPSER: A Dataset for In-Person Student Engagement Recognition in the\n Wild","summary":" In this paper, a novel dataset is introduced, designed to assess student\nattention within in-person classroom settings. This dataset encompasses RGB\ncamera data, featuring multiple cameras per student to capture both posture and\nfacial expressions, in addition to smartwatch sensor data for each individual.\nThis dataset allows machine learning algorithms to be trained to predict\nattention and correlate it with emotion. A comprehensive suite of attention and\nemotion labels for each student is provided, generated through self-reporting\nas well as evaluations by four different experts. Our dataset uniquely combines\nfacial and environmental camera data, smartwatch metrics, and includes\nunderrepresented ethnicities in similar datasets, all within in-the-wild,\nin-person settings, making it the most comprehensive dataset of its kind\ncurrently available.\n The dataset presented offers an extensive and diverse collection of data\npertaining to student interactions across different educational contexts,\naugmented with additional metadata from other tools. 
This initiative addresses\nexisting deficiencies by offering a valuable resource for the analysis of\nstudent attention and emotion in face-to-face lessons.\n","authors":["Luis Marquez-Carpintero","Sergio Suescun-Ferrandiz","Carolina Lorenzo Álvarez","Jorge Fernandez-Herrero","Diego Viejo","Rosabel Roig-Vila","Miguel Cazorla"],"pdf_url":"https://arxiv.org/pdf/2502.20209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.02820v2","updated":"2025-03-02T13:14:11Z","published":"2022-08-04T02:22:29Z","title":"MOVE: Effective and Harmless Ownership Verification via Embedded\n External Features","summary":" Currently, deep neural networks (DNNs) are widely adopted in different\napplications. Despite its commercial values, training a well-performing DNN is\nresource-consuming. Accordingly, the well-trained model is valuable\nintellectual property for its owner. However, recent studies revealed the\nthreats of model stealing, where the adversaries can obtain a function-similar\ncopy of the victim model, even when they can only query the model. In this\npaper, we propose an effective and harmless model ownership verification (MOVE)\nto defend against different types of model stealing simultaneously, without\nintroducing new security risks. In general, we conduct the ownership\nverification by verifying whether a suspicious model contains the knowledge of\ndefender-specified external features. Specifically, we embed the external\nfeatures by modifying a few training samples with style transfer. We then train\na meta-classifier to determine whether a model is stolen from the victim. This\napproach is inspired by the understanding that the stolen models should contain\nthe knowledge of features learned by the victim model. 
In particular,\n\\revision{we develop our MOVE method under both white-box and black-box\nsettings and analyze its theoretical foundation to provide comprehensive model\nprotection.} Extensive experiments on benchmark datasets verify the\neffectiveness of our method and its resistance to potential adaptive attacks.\nThe codes for reproducing the main experiments of our method are available at\nhttps://github.com/THUYimingLi/MOVE.\n","authors":["Yiming Li","Linghui Zhu","Xiaojun Jia","Yang Bai","Yong Jiang","Shu-Tao Xia","Xiaochun Cao","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2208.02820v2.pdf","comment":"This paper has been accepted by IEEE TPAMI 2025. It is the journal\n extension of our conference paper in AAAI 2022\n (https://ojs.aaai.org/index.php/AAAI/article/view/20036). 18 pages"},{"id":"http://arxiv.org/abs/2502.18461v2","updated":"2025-03-02T12:44:06Z","published":"2025-02-25T18:59:12Z","title":"K-LoRA: Unlocking Training-Free Fusion of Any Subject and Style LoRAs","summary":" Recent studies have explored combining different LoRAs to jointly generate\nlearned style and content. However, existing methods either fail to effectively\npreserve both the original subject and style simultaneously or require\nadditional training. In this paper, we argue that the intrinsic properties of\nLoRA can effectively guide diffusion models in merging learned subject and\nstyle. Building on this insight, we propose K-LoRA, a simple yet effective\ntraining-free LoRA fusion approach. In each attention layer, K-LoRA compares\nthe Top-K elements in each LoRA to be fused, determining which LoRA to select\nfor optimal fusion. This selection mechanism ensures that the most\nrepresentative features of both subject and style are retained during the\nfusion process, effectively balancing their contributions. 
Experimental results\ndemonstrate that the proposed method effectively integrates the subject and\nstyle information learned by the original LoRAs, outperforming state-of-the-art\ntraining-based approaches in both qualitative and quantitative results.\n","authors":["Ziheng Ouyang","Zhen Li","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2502.18461v2.pdf","comment":"CVPR 2025, Project page: https://k-lora.github.io/K-LoRA.io/"},{"id":"http://arxiv.org/abs/2406.15812v2","updated":"2025-03-02T12:28:24Z","published":"2024-06-22T10:36:04Z","title":"Intrinsic Dimension Correlation: uncovering nonlinear connections in\n multimodal representations","summary":" To gain insight into the mechanisms behind machine learning methods, it is\ncrucial to establish connections among the features describing data points.\nHowever, these correlations often exhibit a high-dimensional and strongly\nnonlinear nature, which makes them challenging to detect using standard\nmethods. This paper exploits the entanglement between intrinsic dimensionality\nand correlation to propose a metric that quantifies the (potentially nonlinear)\ncorrelation between high-dimensional manifolds. We first validate our method on\nsynthetic data in controlled environments, showcasing its advantages and\ndrawbacks compared to existing techniques. Subsequently, we extend our analysis\nto large-scale applications in neural network representations. Specifically, we\nfocus on latent representations of multimodal data, uncovering clear\ncorrelations between paired visual and textual embeddings, whereas existing\nmethods struggle significantly in detecting similarity. 
Our results indicate\nthe presence of highly nonlinear correlation patterns between latent manifolds.\n","authors":["Lorenzo Basile","Santiago Acevedo","Luca Bortolussi","Fabio Anselmi","Alex Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2406.15812v2.pdf","comment":"Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2409.20063v2","updated":"2025-03-02T12:17:51Z","published":"2024-09-30T08:05:00Z","title":"Q-Bench-Video: Benchmarking the Video Quality Understanding of LMMs","summary":" With the rising interest in research on Large Multi-modal Models (LMMs) for\nvideo understanding, many studies have emphasized general video comprehension\ncapabilities, neglecting the systematic exploration into video quality\nunderstanding. To address this oversight, we introduce Q-Bench-Video in this\npaper, a new benchmark specifically designed to evaluate LMMs' proficiency in\ndiscerning video quality. a) To ensure video source diversity, Q-Bench-Video\nencompasses videos from natural scenes, AI-generated Content (AIGC), and\nComputer Graphics (CG). b) Building on the traditional multiple-choice\nquestions format with the Yes-or-No and What-How categories, we include\nOpen-ended questions to better evaluate complex scenarios. Additionally, we\nincorporate the video pair quality comparison question to enhance\ncomprehensiveness. c) Beyond the traditional Technical, Aesthetic, and Temporal\ndistortions, we have expanded our evaluation aspects to include the dimension\nof AIGC distortions, which addresses the increasing demand for video\ngeneration. Finally, we collect a total of 2,378 question-answer pairs and test\nthem on 12 open-source & 5 proprietary LMMs. Our findings indicate that while\nLMMs have a foundational understanding of video quality, their performance\nremains incomplete and imprecise, with a notable discrepancy compared to human\nperformance. 
Through Q-Bench-Video, we seek to catalyze community interest,\nstimulate further research, and unlock the untapped potential of LMMs to close\nthe gap in video quality understanding.\n","authors":["Zicheng Zhang","Ziheng Jia","Haoning Wu","Chunyi Li","Zijian Chen","Yingjie Zhou","Wei Sun","Xiaohong Liu","Xiongkuo Min","Weisi Lin","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.20063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08470v2","updated":"2025-03-02T11:52:31Z","published":"2024-11-13T09:42:12Z","title":"HyperFace: Generating Synthetic Face Recognition Datasets by Exploring\n Face Embedding Hypersphere","summary":" Face recognition datasets are often collected by crawling Internet and\nwithout individuals' consents, raising ethical and privacy concerns. Generating\nsynthetic datasets for training face recognition models has emerged as a\npromising alternative. However, the generation of synthetic datasets remains\nchallenging as it entails adequate inter-class and intra-class variations.\nWhile advances in generative models have made it easier to increase intra-class\nvariations in face datasets (such as pose, illumination, etc.), generating\nsufficient inter-class variation is still a difficult task. In this paper, we\nformulate the dataset generation as a packing problem on the embedding space\n(represented on a hypersphere) of a face recognition model and propose a new\nsynthetic dataset generation approach, called HyperFace. We formalize our\npacking problem as an optimization problem and solve it with a gradient\ndescent-based approach. Then, we use a conditional face generator model to\nsynthesize face images from the optimized embeddings. We use our generated\ndatasets to train face recognition models and evaluate the trained models on\nseveral benchmarking real datasets. 
Our experimental results show that models\ntrained with HyperFace achieve state-of-the-art performance in training face\nrecognition using synthetic datasets.\n","authors":["Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.08470v2.pdf","comment":"Accepted in ICLR 2025"},{"id":"http://arxiv.org/abs/2408.09886v3","updated":"2025-03-02T11:32:04Z","published":"2024-08-19T11:01:00Z","title":"Improved Baselines with Synchronized Encoding for Universal Medical\n Image Segmentation","summary":" Large foundation models, known for their strong zero-shot generalization\ncapabilities, can be applied to a wide range of downstream tasks. However,\ndeveloping foundation models for medical image segmentation poses a significant\nchallenge due to the domain gap between natural and medical images. While\nfine-tuning techniques based on the Segment Anything Model (SAM) have been\nexplored, they primarily focus on scaling up data or refining inference\nstrategies without incorporating domain-specific architectural designs,\nlimiting their zero-shot performance. To optimize segmentation performance\nunder standard inference settings and provide a strong baseline for future\nresearch, we introduce SyncSAM, which employs a synchronized dual-branch\nencoder that integrates convolution and Transformer features in a synchronized\nmanner to enhance medical image encoding, and a multi-scale dual-branch decoder\nto preserve image details. SyncSAM is trained on two of the largest medical\nimage segmentation datasets, SA-Med2D-20M and IMed-361M, resulting in a series\nof pre-trained models for universal medical image segmentation. Experimental\nresults demonstrate that SyncSAM not only achieves state-of-the-art performance\non test sets but also exhibits strong zero-shot capabilities on unseen\ndatasets. 
The code and model weights are available at\nhttps://github.com/Hhankyangg/SyncSAM.\n","authors":["Sihan Yang","Xuande Mi","Jiadong Feng","Haixia Bi","Hai Zhang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2408.09886v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15445v2","updated":"2025-03-02T11:16:08Z","published":"2025-01-26T08:22:44Z","title":"StochSync: Stochastic Diffusion Synchronization for Image Generation in\n Arbitrary Spaces","summary":" We propose a zero-shot method for generating images in arbitrary spaces\n(e.g., a sphere for 360{\\deg} panoramas and a mesh surface for texture) using a\npretrained image diffusion model. The zero-shot generation of various visual\ncontent using a pretrained image diffusion model has been explored mainly in\ntwo directions. First, Diffusion Synchronization-performing reverse diffusion\nprocesses jointly across different projected spaces while synchronizing them in\nthe target space-generates high-quality outputs when enough conditioning is\nprovided, but it struggles in its absence. Second, Score Distillation\nSampling-gradually updating the target space data through gradient\ndescent-results in better coherence but often lacks detail. In this paper, we\nreveal for the first time the interconnection between these two methods while\nhighlighting their differences. To this end, we propose StochSync, a novel\napproach that combines the strengths of both, enabling effective performance\nwith weak conditioning. 
Our experiments demonstrate that StochSync provides the\nbest performance in 360{\\deg} panorama generation (where image conditioning is\nnot given), outperforming previous finetuning-based methods, and also delivers\ncomparable results in 3D mesh texturing (where depth conditioning is provided)\nwith previous methods.\n","authors":["Kyeongmin Yeo","Jaihoon Kim","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2501.15445v2.pdf","comment":"Project page: https://stochsync.github.io/ (ICLR 2025)"},{"id":"http://arxiv.org/abs/2410.05260v2","updated":"2025-03-02T10:58:06Z","published":"2024-10-07T17:58:22Z","title":"DartControl: A Diffusion-Based Autoregressive Motion Model for Real-Time\n Text-Driven Motion Control","summary":" Text-conditioned human motion generation, which allows for user interaction\nthrough natural language, has become increasingly popular. Existing methods\ntypically generate short, isolated motions based on a single input sentence.\nHowever, human motions are continuous and can extend over long periods,\ncarrying rich semantics. Creating long, complex motions that precisely respond\nto streams of text descriptions, particularly in an online and real-time\nsetting, remains a significant challenge. Furthermore, incorporating spatial\nconstraints into text-conditioned motion generation presents additional\nchallenges, as it requires aligning the motion semantics specified by text\ndescriptions with geometric information, such as goal locations and 3D scene\ngeometry. To address these limitations, we propose DartControl, in short DART,\na Diffusion-based Autoregressive motion primitive model for Real-time\nText-driven motion control. Our model effectively learns a compact motion\nprimitive space jointly conditioned on motion history and text inputs using\nlatent diffusion models. 
By autoregressively generating motion primitives based\non the preceding history and current text input, DART enables real-time,\nsequential motion generation driven by natural language descriptions.\nAdditionally, the learned motion primitive space allows for precise spatial\nmotion control, which we formulate either as a latent noise optimization\nproblem or as a Markov decision process addressed through reinforcement\nlearning. We present effective algorithms for both approaches, demonstrating\nour model's versatility and superior performance in various motion synthesis\ntasks. Experiments show our method outperforms existing baselines in motion\nrealism, efficiency, and controllability. Video results are available on the\nproject page: https://zkf1997.github.io/DART/.\n","authors":["Kaifeng Zhao","Gen Li","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2410.05260v2.pdf","comment":"Updated ICLR camera ready version"},{"id":"http://arxiv.org/abs/2402.04236v3","updated":"2025-03-02T09:39:57Z","published":"2024-02-06T18:43:48Z","title":"CogCoM: A Visual Language Model with Chain-of-Manipulations Reasoning","summary":" Vision-Language Models (VLMs) have demonstrated their broad effectiveness\nthanks to extensive training in aligning visual instructions to responses.\nHowever, such training of conclusive alignment leads models to ignore essential\nvisual reasoning, further resulting in failures in meticulous visual problems\nand unfaithful responses. Drawing inspiration from human cognition in solving\nvisual problems (e.g., marking, zoom in), this paper introduces Chain of\nManipulations, a mechanism that enables VLMs to solve problems step-by-step\nwith evidence. After training, models can solve various visual problems by\neliciting intrinsic manipulations (e.g., grounding, zoom in) with results\n(e.g., boxes, image) actively without involving external tools, while also\nallowing users to trace error causes. 
We study the roadmap to implement this\nmechanism, including (1) a flexible design of manipulations upon extensive\nanalysis, (2) an efficient automated data generation pipeline, (3) a compatible\nVLM architecture capable of multi-turn multi-image, and (4) a model training\nprocess for versatile capabilities. With the design, we also manually annotate\n6K high-quality samples for the challenging graphical mathematical problems.\nOur trained model, \\textbf{CogCoM}, equipped with this mechanism with 17B\nparameters achieves state-of-the-art performance across 9 benchmarks from 4\ncategories, demonstrating the effectiveness while preserving the\ninterpretability. Our code, model weights, and collected data are publicly\navailable at https://github.com/THUDM/CogCoM.\n","authors":["Ji Qi","Ming Ding","Weihan Wang","Yushi Bai","Qingsong Lv","Wenyi Hong","Bin Xu","Lei Hou","Juanzi Li","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2402.04236v3.pdf","comment":"21 pages, 10 figures"},{"id":"http://arxiv.org/abs/2502.18176v2","updated":"2025-03-02T09:22:47Z","published":"2025-02-25T13:09:34Z","title":"CLIPure: Purification in Latent Space via CLIP for Adversarially Robust\n Zero-Shot Classification","summary":" In this paper, we aim to build an adversarially robust zero-shot image\nclassifier. We ground our work on CLIP, a vision-language pre-trained encoder\nmodel that can perform zero-shot classification by matching an image with text\nprompts ``a photo of a .''. Purification is the path we choose\nsince it does not require adversarial training on specific attack types and\nthus can cope with any foreseen attacks. We then formulate purification risk as\nthe KL divergence between the joint distributions of the purification process\nof denoising the adversarial samples and the attack process of adding\nperturbations to benign samples, through bidirectional Stochastic Differential\nEquations (SDEs). 
The final derived results inspire us to explore purification\nin the multi-modal latent space of CLIP. We propose two variants for our\nCLIPure approach: CLIPure-Diff which models the likelihood of images' latent\nvectors with the DiffusionPrior module in DaLLE-2 (modeling the generation\nprocess of CLIP's latent vectors), and CLIPure-Cos which models the likelihood\nwith the cosine similarity between the embeddings of an image and ``a photo of\na.''. As far as we know, CLIPure is the first purification method in\nmulti-modal latent space and CLIPure-Cos is the first purification method that\nis not based on generative models, which substantially improves defense\nefficiency. We conducted extensive experiments on CIFAR-10, ImageNet, and 13\ndatasets that previous CLIP-based defense methods used for evaluating zero-shot\nclassification robustness. Results show that CLIPure boosts the SOTA robustness\nby a large margin, e.g., from 71.7% to 91.1% on CIFAR10, from 59.6% to 72.6% on\nImageNet, and 108% relative improvements of average robustness on the 13\ndatasets over previous SOTA. The code is available at\nhttps://github.com/TMLResearchGroup-CAS/CLIPure.\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2502.18176v2.pdf","comment":"accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2412.09765v2","updated":"2025-03-02T09:21:27Z","published":"2024-12-12T23:57:01Z","title":"L-WISE: Boosting Human Visual Category Learning Through Model-Based\n Image Selection And Enhancement","summary":" The currently leading artificial neural network models of the visual ventral\nstream - which are derived from a combination of performance optimization and\nrobustification methods - have demonstrated a remarkable degree of behavioral\nalignment with humans on visual categorization tasks. 
We show that image\nperturbations generated by these models can enhance the ability of humans to\naccurately report the ground truth class. Furthermore, we find that the same\nmodels can also be used out-of-the-box to predict the proportion of correct\nhuman responses to individual images, providing a simple, human-aligned\nestimator of the relative difficulty of each image. Motivated by these\nobservations, we propose to augment visual learning in humans in a way that\nimproves human categorization accuracy at test time. Our learning augmentation\napproach consists of (i) selecting images based on their model-estimated\nrecognition difficulty, and (ii) applying image perturbations that aid\nrecognition for novice learners. We find that combining these model-based\nstrategies leads to categorization accuracy gains of 33-72% relative to control\nsubjects without these interventions, on unmodified, randomly selected held-out\ntest images. Beyond the accuracy gain, the training time for the augmented\nlearning group was also shortened by 20-23%, despite both groups completing the\nsame number of training trials. We demonstrate the efficacy of our approach in\na fine-grained categorization task with natural images, as well as two tasks in\nclinically relevant image domains - histology and dermoscopy - where visual\nlearning is notoriously challenging. To the best of our knowledge, our work is\nthe first application of artificial neural networks to increase visual learning\nperformance in humans by enhancing category-specific image features.\n","authors":["Morgan B. Talbot","Gabriel Kreiman","James J. 
DiCarlo","Guy Gaziv"],"pdf_url":"https://arxiv.org/pdf/2412.09765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06751v2","updated":"2025-03-02T09:10:13Z","published":"2025-01-12T08:36:38Z","title":"Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models","summary":" Text-to-image (T2I) diffusion models rely on encoded prompts to guide the\nimage generation process. Typically, these prompts are extended to a fixed\nlength by adding padding tokens before text encoding. Despite being a default\npractice, the influence of padding tokens on the image generation process has\nnot been investigated. In this work, we conduct the first in-depth analysis of\nthe role padding tokens play in T2I models. We develop two causal techniques to\nanalyze how information is encoded in the representation of tokens across\ndifferent components of the T2I pipeline. Using these techniques, we\ninvestigate when and how padding tokens impact the image generation process.\nOur findings reveal three distinct scenarios: padding tokens may affect the\nmodel's output during text encoding, during the diffusion process, or be\neffectively ignored. Moreover, we identify key relationships between these\nscenarios and the model's architecture (cross or self-attention) and its\ntraining process (frozen or trained text encoder). These insights contribute to\na deeper understanding of the mechanisms of padding tokens, potentially\ninforming future model design and training practices in T2I systems.\n","authors":["Michael Toker","Ido Galil","Hadas Orgad","Rinon Gal","Yoad Tewel","Gal Chechik","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2501.06751v2.pdf","comment":"Published in: NAACL 2025. 
Project webpage:\n https://padding-tone.github.io/"},{"id":"http://arxiv.org/abs/2410.06614v2","updated":"2025-03-02T08:59:29Z","published":"2024-10-09T07:09:46Z","title":"Pair-VPR: Place-Aware Pre-training and Contrastive Pair Classification\n for Visual Place Recognition with Vision Transformers","summary":" In this work we propose a novel joint training method for Visual Place\nRecognition (VPR), which simultaneously learns a global descriptor and a pair\nclassifier for re-ranking. The pair classifier can predict whether a given pair\nof images are from the same place or not. The network only comprises Vision\nTransformer components for both the encoder and the pair classifier, and both\ncomponents are trained using their respective class tokens. In existing VPR\nmethods, typically the network is initialized using pre-trained weights from a\ngeneric image dataset such as ImageNet. In this work we propose an alternative\npre-training strategy, by using Siamese Masked Image Modelling as a\npre-training task. We propose a Place-aware image sampling procedure from a\ncollection of large VPR datasets for pre-training our model, to learn visual\nfeatures tuned specifically for VPR. By re-using the Mask Image Modelling\nencoder and decoder weights in the second stage of training, Pair-VPR can\nachieve state-of-the-art VPR performance across five benchmark datasets with a\nViT-B encoder, along with further improvements in localization recall with\nlarger encoders. 
The Pair-VPR website is:\nhttps://csiro-robotics.github.io/Pair-VPR.\n","authors":["Stephen Hausler","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2410.06614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20092v2","updated":"2025-03-02T08:56:15Z","published":"2025-02-27T13:51:56Z","title":"WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model\n Evaluation","summary":" The UAV technology is gradually maturing and can provide extremely powerful\nsupport for smart agriculture and precise monitoring. Currently, there is no\ndataset related to green walnuts in the field of agricultural computer vision.\nThus, in order to promote the algorithm design in the field of agricultural\ncomputer vision, we used UAV to collect remote-sensing data from 8 walnut\nsample plots. Considering that green walnuts are subject to various lighting\nconditions and occlusion, we constructed a large-scale dataset with a\nhigher-granularity of target features - WalnutData. This dataset contains a\ntotal of 30,240 images and 706,208 instances, and there are 4 target\ncategories: being illuminated by frontal light and unoccluded (A1), being\nbacklit and unoccluded (A2), being illuminated by frontal light and occluded\n(B1), and being backlit and occluded (B2). Subsequently, we evaluated many\nmainstream algorithms on WalnutData and used these evaluation results as the\nbaseline standard. 
The dataset and all evaluation results can be obtained at\nhttps://github.com/1wuming/WalnutData.\n","authors":["Mingjie Wu","Chenggui Yang","Huihua Wang","Chen Xue","Yibo Wang","Haoyu Wang","Yansong Wang","Can Peng","Yuqi Han","Ruoyu Li","Lijun Yun","Zaiqing Chen","Songfan Shi","Luhao Fang","Shuyi Wan","Tingfeng Li","Shuangyao Liu","Haotian Feng"],"pdf_url":"https://arxiv.org/pdf/2502.20092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14808v2","updated":"2025-03-02T08:53:47Z","published":"2024-11-22T09:08:58Z","title":"High-Resolution Image Synthesis via Next-Token Prediction","summary":" Recently, autoregressive models have demonstrated remarkable performance in\nclass-conditional image generation. However, the application of next-token\nprediction to high-resolution text-to-image generation remains largely\nunexplored. In this paper, we introduce \\textbf{D-JEPA$\\cdot$T2I}, an\nautoregressive model based on continuous tokens that incorporates innovations\nin both architecture and training strategy to generate high-quality,\nphotorealistic images at arbitrary resolutions, up to 4K. Architecturally, we\nadopt the denoising joint embedding predictive architecture (D-JEPA) while\nleveraging a multimodal visual transformer to effectively integrate textual and\nvisual features. Additionally, we introduce flow matching loss alongside the\nproposed Visual Rotary Positional Embedding (VoPE) to enable continuous\nresolution learning. In terms of training strategy, we propose a data feedback\nmechanism that dynamically adjusts the sampling procedure based on statistical\nanalysis and an online learning critic model. This encourages the model to move\nbeyond its comfort zone, reducing redundant training on well-mastered scenarios\nand compelling it to address more challenging cases with suboptimal generation\nquality. 
For the first time, we achieve state-of-the-art high-resolution image\nsynthesis via next-token prediction.\n","authors":["Dengsheng Chen","Jie Hu","Tiezhu Yue","Xiaoming Wei","Enhua Wu"],"pdf_url":"https://arxiv.org/pdf/2411.14808v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2411.03990v2","updated":"2025-03-02T08:11:34Z","published":"2024-11-06T15:30:42Z","title":"ET-SEED: Efficient Trajectory-Level SE(3) Equivariant Diffusion Policy","summary":" Imitation learning, e.g., diffusion policy, has been proven effective in\nvarious robotic manipulation tasks. However, extensive demonstrations are\nrequired for policy robustness and generalization. To reduce the demonstration\nreliance, we leverage spatial symmetry and propose ET-SEED, an efficient\ntrajectory-level SE(3) equivariant diffusion model for generating action\nsequences in complex robot manipulation tasks. Further, previous equivariant\ndiffusion models require the per-step equivariance in the Markov process,\nmaking it difficult to learn policy under such strong constraints. We\ntheoretically extend equivariant Markov kernels and simplify the condition of\nequivariant diffusion process, thereby significantly improving training\nefficiency for trajectory-level SE(3) equivariant diffusion policy in an\nend-to-end manner. We evaluate ET-SEED on representative robotic manipulation\ntasks, involving rigid body, articulated and deformable object. Experiments\ndemonstrate superior data efficiency and manipulation proficiency of our\nproposed method, as well as its ability to generalize to unseen configurations\nwith only a few demonstrations. 
Website: https://et-seed.github.io/\n","authors":["Chenrui Tie","Yue Chen","Ruihai Wu","Boxuan Dong","Zeyi Li","Chongkai Gao","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2411.03990v2.pdf","comment":"Accept to ICLR 2025"},{"id":"http://arxiv.org/abs/2412.14169v2","updated":"2025-03-02T08:09:39Z","published":"2024-12-18T18:59:53Z","title":"Autoregressive Video Generation without Vector Quantization","summary":" This paper presents a novel approach that enables autoregressive video\ngeneration with high efficiency. We propose to reformulate the video generation\nproblem as a non-quantized autoregressive modeling of temporal frame-by-frame\nprediction and spatial set-by-set prediction. Unlike raster-scan prediction in\nprior autoregressive models or joint distribution modeling of fixed-length\ntokens in diffusion models, our approach maintains the causal property of\nGPT-style models for flexible in-context capabilities, while leveraging\nbidirectional modeling within individual frames for efficiency. With the\nproposed approach, we train a novel video autoregressive model without vector\nquantization, termed NOVA. Our results demonstrate that NOVA surpasses prior\nautoregressive video models in data efficiency, inference speed, visual\nfidelity, and video fluency, even with a much smaller model capacity, i.e.,\n0.6B parameters. NOVA also outperforms state-of-the-art image diffusion models\nin text-to-image generation tasks, with a significantly lower training cost.\nAdditionally, NOVA generalizes well across extended video durations and enables\ndiverse zero-shot applications in one unified model. Code and models are\npublicly available at https://github.com/baaivision/NOVA.\n","authors":["Haoge Deng","Ting Pan","Haiwen Diao","Zhengxiong Luo","Yufeng Cui","Huchuan Lu","Shiguang Shan","Yonggang Qi","Xinlong Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14169v2.pdf","comment":"Accepted to ICLR 2025. 
Project page at\n https://github.com/baaivision/NOVA"},{"id":"http://arxiv.org/abs/2404.14396v2","updated":"2025-03-02T07:53:44Z","published":"2024-04-22T17:56:09Z","title":"SEED-X: Multimodal Models with Unified Multi-granularity Comprehension\n and Generation","summary":" The rapid evolution of multimodal foundation model has demonstrated\nsignificant progresses in vision-language understanding and generation, e.g.,\nour previous work SEED-LLaMA. However, there remains a gap between its\ncapability and the real-world applicability, primarily due to the model's\nlimited capacity to effectively respond to various user instructions and\ninteract with diverse visual data. In this work, we focus on bridging this gap\nthrough integrating two enhanced features: (1) comprehending images of\narbitrary sizes and ratios, and (2) enabling multi-granularity image\ngeneration. We present a unified and versatile foundation model, namely,\nSEED-X, which is able to model multi-granularity visual semantics for\ncomprehension and generation tasks. Besides the competitive results on public\nbenchmarks, SEED-X demonstrates its effectiveness in handling real-world\napplications across various domains after instruction tuning. We hope that our\nwork will inspire future research into what can be achieved by versatile\nmultimodal foundation models in real-world applications. The models, codes, and\ndatasets are released in https://github.com/AILab-CVC/SEED-X.\n","authors":["Yuying Ge","Sijie Zhao","Jinguo Zhu","Yixiao Ge","Kun Yi","Lin Song","Chen Li","Xiaohan Ding","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.14396v2.pdf","comment":"We added benchmark results (without updating models) and ablation\n study in this version. 
Project released at:\n https://github.com/AILab-CVC/SEED-X"},{"id":"http://arxiv.org/abs/2405.20986v2","updated":"2025-03-02T07:46:05Z","published":"2024-05-31T16:32:46Z","title":"Predictive Uncertainty Quantification for Bird's Eye View Segmentation:\n A Benchmark and Novel Loss Function","summary":" The fusion of raw sensor data to create a Bird's Eye View (BEV)\nrepresentation is critical for autonomous vehicle planning and control. Despite\nthe growing interest in using deep learning models for BEV semantic\nsegmentation, anticipating segmentation errors and enhancing the explainability\nof these models remain underexplored. This paper introduces a comprehensive\nbenchmark for predictive uncertainty quantification in BEV segmentation,\nevaluating multiple uncertainty quantification methods across three popular\ndatasets with three representative network architectures. Our study focuses on\nthe effectiveness of quantified uncertainty in detecting misclassified and\nout-of-distribution (OOD) pixels while also improving model calibration.\nThrough empirical analysis, we uncover challenges in existing uncertainty\nquantification methods and demonstrate the potential of evidential deep\nlearning techniques, which capture both aleatoric and epistemic uncertainty. 
To\naddress these challenges, we propose a novel loss function,\nUncertainty-Focal-Cross-Entropy (UFCE), specifically designed for highly\nimbalanced data, along with a simple uncertainty-scaling regularization term\nthat improves both uncertainty quantification and model calibration for BEV\nsegmentation.\n","authors":["Linlin Yu","Bowen Yang","Tianhao Wang","Kangshuo Li","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20986v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2410.03355v3","updated":"2025-03-02T07:45:09Z","published":"2024-10-04T12:21:03Z","title":"LANTERN: Accelerating Visual Autoregressive Models with Relaxed\n Speculative Decoding","summary":" Auto-Regressive (AR) models have recently gained prominence in image\ngeneration, often matching or even surpassing the performance of diffusion\nmodels. However, one major limitation of AR models is their sequential nature,\nwhich processes tokens one at a time, slowing down generation compared to\nmodels like GANs or diffusion-based methods that operate more efficiently.\nWhile speculative decoding has proven effective for accelerating LLMs by\ngenerating multiple tokens in a single forward, its application in visual AR\nmodels remains largely unexplored. In this work, we identify a challenge in\nthis setting, which we term \\textit{token selection ambiguity}, wherein visual\nAR models frequently assign uniformly low probabilities to tokens, hampering\nthe performance of speculative decoding. To overcome this challenge, we propose\na relaxed acceptance condition referred to as LANTERN that leverages the\ninterchangeability of tokens in latent space. 
This relaxation restores the\neffectiveness of speculative decoding in visual AR models by enabling more\nflexible use of candidate tokens that would otherwise be prematurely rejected.\nFurthermore, by incorporating a total variation distance bound, we ensure that\nthese speed gains are achieved without significantly compromising image quality\nor semantic coherence. Experimental results demonstrate the efficacy of our\nmethod in providing a substantial speed-up over speculative decoding. In\nspecific, compared to a na\\\"ive application of the state-of-the-art speculative\ndecoding, LANTERN increases speed-ups by $\\mathbf{1.75}\\times$ and\n$\\mathbf{1.82}\\times$, as compared to greedy decoding and random sampling,\nrespectively, when applied to LlamaGen, a contemporary visual AR model. The\ncode is publicly available at https://github.com/jadohu/LANTERN.\n","authors":["Doohyuk Jang","Sihwan Park","June Yong Yang","Yeonsung Jung","Jihun Yun","Souvik Kundu","Sung-Yub Kim","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2410.03355v3.pdf","comment":"30 pages, 13 figures, Accepted to ICLR 2025 (poster)"},{"id":"http://arxiv.org/abs/2410.10010v3","updated":"2025-03-02T07:42:20Z","published":"2024-10-13T21:11:04Z","title":"InterMask: 3D Human Interaction Generation via Collaborative Masked\n Modeling","summary":" Generating realistic 3D human-human interactions from textual descriptions\nremains a challenging task. Existing approaches, typically based on diffusion\nmodels, often produce results lacking realism and fidelity. In this work, we\nintroduce InterMask, a novel framework for generating human interactions using\ncollaborative masked modeling in discrete space. 
InterMask first employs a\nVQ-VAE to transform each motion sequence into a 2D discrete motion token map.\nUnlike traditional 1D VQ token maps, it better preserves fine-grained\nspatio-temporal details and promotes spatial awareness within each token.\nBuilding on this representation, InterMask utilizes a generative masked\nmodeling framework to collaboratively model the tokens of two interacting\nindividuals. This is achieved by employing a transformer architecture\nspecifically designed to capture complex spatio-temporal inter-dependencies.\nDuring training, it randomly masks the motion tokens of both individuals and\nlearns to predict them. For inference, starting from fully masked sequences, it\nprogressively fills in the tokens for both individuals. With its enhanced\nmotion representation, dedicated architecture, and effective learning strategy,\nInterMask achieves state-of-the-art results, producing high-fidelity and\ndiverse human interactions. It outperforms previous methods, achieving an FID\nof $5.154$ (vs $5.535$ of in2IN) on the InterHuman dataset and $0.399$ (vs\n$5.207$ of InterGen) on the InterX dataset. 
Additionally, InterMask seamlessly\nsupports reaction generation without the need for model redesign or\nfine-tuning.\n","authors":["Muhammad Gohar Javed","Chuan Guo","Li Cheng","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2410.10010v3.pdf","comment":"Project webpage: https://gohar-malik.github.io/intermask"},{"id":"http://arxiv.org/abs/2409.19835v2","updated":"2025-03-02T07:32:50Z","published":"2024-09-30T00:17:00Z","title":"MoCoLSK: Modality Conditioned High-Resolution Downscaling for Land\n Surface Temperature","summary":" Land Surface Temperature (LST) is a critical parameter for environmental\nstudies, but directly obtaining high spatial resolution LST data remains\nchallenging due to the spatio-temporal trade-off in satellite remote sensing.\nGuided LST downscaling has emerged as an alternative solution to overcome these\nlimitations, but current methods often neglect spatial non-stationarity, and\nthere is a lack of an open-source ecosystem for deep learning methods. In this\npaper, we propose the Modality-Conditional Large Selective Kernel (MoCoLSK)\nNetwork, a novel architecture that dynamically fuses multi-modal data through\nmodality-conditioned projections. MoCoLSK achieves a confluence of dynamic\nreceptive field adjustment and multi-modal feature fusion, leading to enhanced\nLST prediction accuracy. Furthermore, we establish the GrokLST project, a\ncomprehensive open-source ecosystem featuring the GrokLST dataset, a\nhigh-resolution benchmark, and the GrokLST toolkit, an open-source\nPyTorch-based toolkit encapsulating MoCoLSK alongside 40+ state-of-the-art\napproaches. Extensive experimental results validate MoCoLSK's effectiveness in\ncapturing complex dependencies and subtle variations within multispectral data,\noutperforming existing methods in LST downscaling. 
Our code, dataset, and\ntoolkit are available at https://github.com/GrokCV/GrokLST.\n","authors":["Qun Dai","Chunyang Yuan","Yimian Dai","Yuxuan Li","Xiang Li","Kang Ni","Jianhui Xu","Xiangbo Shu","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2409.19835v2.pdf","comment":"Accepted by IEEE TGRS"},{"id":"http://arxiv.org/abs/2502.10982v3","updated":"2025-03-02T07:31:57Z","published":"2025-02-16T04:00:06Z","title":"TEASER: Token Enhanced Spatial Modeling for Expressions Reconstruction","summary":" 3D facial reconstruction from a single in-the-wild image is a crucial task in\nhuman-centered computer vision tasks. While existing methods can recover\naccurate facial shapes, there remains significant space for improvement in\nfine-grained expression capture. Current approaches struggle with irregular\nmouth shapes, exaggerated expressions, and asymmetrical facial movements. We\npresent TEASER (Token EnhAnced Spatial modeling for Expressions\nReconstruction), which addresses these challenges and enhances 3D facial\ngeometry performance. TEASER tackles two main limitations of existing methods:\ninsufficient photometric loss for self-reconstruction and inaccurate\nlocalization of subtle expressions. We introduce a multi-scale tokenizer to\nextract facial appearance information. Combined with a neural renderer, these\ntokens provide precise geometric guidance for expression reconstruction.\nFurthermore, TEASER incorporates a pose-dependent landmark loss to further\nimprove geometric performances. Our approach not only significantly enhances\nexpression reconstruction quality but also offers interpretable tokens suitable\nfor various downstream applications, such as photorealistic facial video\ndriving, expression transfer, and identity swapping. 
Quantitative and\nqualitative experimental results across multiple datasets demonstrate that\nTEASER achieves state-of-the-art performance in precise expression\nreconstruction.\n","authors":["Yunfei Liu","Lei Zhu","Lijian Lin","Ye Zhu","Ailing Zhang","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2502.10982v3.pdf","comment":"Accepted by ICLR 2025, code and demos are available at\n https://tinyurl.com/TEASER-project"},{"id":"http://arxiv.org/abs/2501.19069v2","updated":"2025-03-02T07:22:57Z","published":"2025-01-31T11:55:17Z","title":"Improving vision-language alignment with graph spiking hybrid Networks","summary":" To bridge the semantic gap between vision and language (VL), it is necessary\nto develop a good alignment strategy, which includes handling semantic\ndiversity, abstract representation of visual information, and generalization\nability of models. Recent works use detector-based bounding boxes or patches\nwith regular partitions to represent visual semantics. While current paradigms\nhave made strides, they are still insufficient for fully capturing the nuanced\ncontextual relations among various objects. This paper proposes a comprehensive\nvisual semantic representation module, necessitating the utilization of\npanoptic segmentation to generate coherent fine-grained semantic features.\nFurthermore, we propose a novel Graph Spiking Hybrid Network (GSHN) that\nintegrates the complementary advantages of Spiking Neural Networks (SNNs) and\nGraph Attention Networks (GATs) to encode visual semantic information.\nIntriguingly, the model not only encodes the discrete and continuous latent\nvariables of instances but also adeptly captures both local and global\ncontextual features, thereby significantly enhancing the richness and diversity\nof semantic representations. Leveraging the spatiotemporal properties inherent\nin SNNs, we employ contrastive learning (CL) to enhance the similarity-based\nrepresentation of embeddings. 
This strategy alleviates the computational\noverhead of the model and enriches meaningful visual representations by\nconstructing positive and negative sample pairs. We design an innovative\npre-training method, Spiked Text Learning (STL), which uses text features to\nimprove the encoding ability of discrete semantics. Experiments show that the\nproposed GSHN exhibits promising results on multiple VL downstream tasks.\n","authors":["Siyu Zhang","Wenzhe Liu","Yeming Chen","Yiming Wu","Heming Zheng","Cheng Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.19069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11817v2","updated":"2025-03-02T07:05:19Z","published":"2024-10-15T17:46:31Z","title":"Improving Long-Text Alignment for Text-to-Image Diffusion Models","summary":" The rapid advancement of text-to-image (T2I) diffusion models has enabled\nthem to generate unprecedented results from given texts. However, as text\ninputs become longer, existing encoding methods like CLIP face limitations, and\naligning the generated images with long texts becomes challenging. To tackle\nthese issues, we propose LongAlign, which includes a segment-level encoding\nmethod for processing long texts and a decomposed preference optimization\nmethod for effective alignment training. For segment-level encoding, long texts\nare divided into multiple segments and processed separately. This method\novercomes the maximum input length limits of pretrained encoding models. For\npreference optimization, we provide decomposed CLIP-based preference models to\nfine-tune diffusion models. Specifically, to utilize CLIP-based preference\nmodels for T2I alignment, we delve into their scoring mechanisms and find that\nthe preference scores can be decomposed into two components: a text-relevant\npart that measures T2I alignment and a text-irrelevant part that assesses other\nvisual aspects of human preference. 
Additionally, we find that the\ntext-irrelevant part contributes to a common overfitting problem during\nfine-tuning. To address this, we propose a reweighting strategy that assigns\ndifferent weights to these two components, thereby reducing overfitting and\nenhancing alignment. After fine-tuning $512 \\times 512$ Stable Diffusion (SD)\nv1.5 for about 20 hours using our method, the fine-tuned SD outperforms\nstronger foundation models in T2I alignment, such as PixArt-$\\alpha$ and\nKandinsky v2.2. The code is available at\nhttps://github.com/luping-liu/LongAlign.\n","authors":["Luping Liu","Chao Du","Tianyu Pang","Zehan Wang","Chongxuan Li","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2410.11817v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03836v3","updated":"2025-03-02T06:41:56Z","published":"2025-01-07T14:45:39Z","title":"SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor\n Diagnosis","summary":" Brain tumors can lead to neurological dysfunction, cognitive and\npsychological changes, increased intracranial pressure, and seizures, posing\nsignificant risks to health. The You Only Look Once (YOLO) series has shown\nsuperior accuracy in medical imaging object detection. This paper presents a\nnovel SCC-YOLO architecture that integrates the SCConv module into YOLOv9. The\nSCConv module optimizes convolutional efficiency by reducing spatial and\nchannel redundancy, enhancing image feature learning. We examine the effects of\ndifferent attention mechanisms with YOLOv9 for brain tumor detection using the\nBr35H dataset and our custom dataset (Brain_Tumor_Dataset). Results indicate\nthat SCC-YOLO improved mAP50 by 0.3% on the Br35H dataset and by 0.5% on our\ncustom dataset compared to YOLOv9. 
SCC-YOLO achieves state-of-the-art\nperformance in brain tumor detection.\n","authors":["Runci Bai","Guibao Xu","Yanze Shi"],"pdf_url":"https://arxiv.org/pdf/2501.03836v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03173v2","updated":"2025-03-02T06:24:05Z","published":"2024-12-04T09:53:09Z","title":"IRisPath: Enhancing Costmap for Off-Road Navigation with Robust IR-RGB\n Fusion for Improved Day and Night Traversability","summary":" Autonomous off-road navigation is required for applications in agriculture,\nconstruction, search and rescue and defence. Traditional on-road autonomous\nmethods struggle with dynamic terrains, leading to poor vehicle control in\noff-road conditions. Recent deep-learning models have used perception sensors\nalong with kinesthetic feedback for navigation on such terrains. However, this\napproach has out-of-domain uncertainty. Factors like change in time of day and\nweather impacts the performance of the model. We propose a multi modal fusion\nnetwork \"IRisPath\" capable of using Thermal and RGB images to provide\nrobustness against dynamic weather and light conditions. To aid further works\nin this domain, we also open-source a day-night dataset with Thermal and RGB\nimages along with pseudo-labels for traversability. 
In order to co-register for\nfusion model we also develop a novel method for targetless extrinsic\ncalibration of Thermal, LiDAR and RGB cameras with translation accuracy of\n+/-1.7cm and rotation accuracy of +/-0.827degrees.\n","authors":["Saksham Sharma","Akshit Raizada","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2412.03173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12033v2","updated":"2025-03-02T06:19:58Z","published":"2023-06-21T05:48:51Z","title":"End-to-End Augmentation Hyperparameter Tuning for Self-Supervised\n Anomaly Detection","summary":" Self-supervised learning (SSL) has emerged as a promising paradigm that\npresents supervisory signals to real-world problems, bypassing the extensive\ncost of manual labeling. Consequently, self-supervised anomaly detection (SSAD)\nhas seen a recent surge of interest, since SSL is especially attractive for\nunsupervised tasks. However, recent works have reported that the choice of a\ndata augmentation function has significant impact on the accuracy of SSAD,\nposing augmentation search as an essential but nontrivial problem with the lack\nof labeled validation data. In this paper, we introduce ST-SSAD, the first\nsystematic approach for rigorous augmentation tuning on SSAD. To this end, our\nwork presents two key contributions. The first is a new unsupervised validation\nloss that quantifies the alignment between augmented training data and\nunlabeled validation data. The second is new differentiable augmentation\nfunctions, allowing data augmentation hyperparameter(s) to be tuned in an\nend-to-end manner. 
Experiments on two testbeds with semantic class anomalies\nand subtle industrial defects show that ST-SSAD gives significant performance\ngains over existing works.\n","authors":["Jaemin Yoo","Lingxiao Zhao","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2306.12033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03051v3","updated":"2025-03-02T06:17:12Z","published":"2024-10-04T00:13:54Z","title":"AuroraCap: Efficient, Performant Video Detailed Captioning and a New\n Benchmark","summary":" Video detailed captioning is a key task which aims to generate comprehensive\nand coherent textual descriptions of video content, benefiting both video\nunderstanding and generation. In this paper, we propose AuroraCap, a video\ncaptioner based on a large multimodal model. We follow the simplest\narchitecture design without additional parameters for temporal modeling. To\naddress the overhead caused by lengthy video sequences, we implement the token\nmerging strategy, reducing the number of input visual tokens. Surprisingly, we\nfound that this strategy results in little performance loss. AuroraCap shows\nsuperior performance on various video and image captioning benchmarks, for\nexample, obtaining a CIDEr of 88.9 on Flickr30k, beating GPT-4V (55.3) and\nGemini-1.5 Pro (82.2). However, existing video caption benchmarks only include\nsimple descriptions, consisting of a few dozen words, which limits research in\nthis field. Therefore, we develop VDC, a video detailed captioning benchmark\nwith over one thousand carefully annotated structured captions. In addition, we\npropose a new LLM-assisted metric VDCscore for bettering evaluation, which\nadopts a divide-and-conquer strategy to transform long caption evaluation into\nmultiple short question-answer pairs. 
With the help of human Elo ranking, our\nexperiments show that this benchmark better correlates with human judgments of\nvideo detailed captioning quality.\n","authors":["Wenhao Chai","Enxin Song","Yilun Du","Chenlin Meng","Vashisht Madhavan","Omer Bar-Tal","Jenq-Neng Hwang","Saining Xie","Christopher D. Manning"],"pdf_url":"https://arxiv.org/pdf/2410.03051v3.pdf","comment":"Accepted to ICLR 2025. Code, docs, weight, benchmark and training\n data are all avaliable at https://rese1f.github.io/aurora-web/"},{"id":"http://arxiv.org/abs/2502.19260v2","updated":"2025-03-02T06:08:34Z","published":"2025-02-26T16:06:35Z","title":"EMT: A Visual Multi-Task Benchmark Dataset for Autonomous Driving in the\n Arab Gulf Region","summary":" This paper introduces the Emirates Multi-Task (EMT) dataset - the first\npublicly available dataset for autonomous driving collected in the Arab Gulf\nregion. The EMT dataset captures the unique road topology, high traffic\ncongestion, and distinctive characteristics of the Gulf region, including\nvariations in pedestrian clothing and weather conditions. It contains over\n30,000 frames from a dash-camera perspective, along with 570,000 annotated\nbounding boxes, covering approximately 150 kilometers of driving routes. The\nEMT dataset supports three primary tasks: tracking, trajectory forecasting and\nintention prediction. Each benchmark dataset is complemented with corresponding\nevaluations: (1) multi-agent tracking experiments, focusing on multi-class\nscenarios and occlusion handling; (2) trajectory forecasting evaluation using\ndeep sequential and interaction-aware models; and (3) intention benchmark\nexperiments conducted for predicting agents intentions from observed\ntrajectories. 
The dataset is publicly available at avlab.io/emt-dataset, and\npre-processing scripts along with evaluation models can be accessed at\ngithub.com/AV-Lab/emt-dataset.\n","authors":["Nadya Abdel Madjid","Murad Mebrahtu","Abdelmoamen Nasser","Bilal Hassan","Naoufel Werghi","Jorge Dias","Majid Khonji"],"pdf_url":"https://arxiv.org/pdf/2502.19260v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2502.08813v2","updated":"2025-03-02T05:50:08Z","published":"2025-02-12T21:55:26Z","title":"Measuring Anxiety Levels with Head Motion Patterns in Severe Depression\n Population","summary":" Depression and anxiety are prevalent mental health disorders that frequently\ncooccur, with anxiety significantly influencing both the manifestation and\ntreatment of depression. An accurate assessment of anxiety levels in\nindividuals with depression is crucial to develop effective and personalized\ntreatment plans. This study proposes a new noninvasive method for quantifying\nanxiety severity by analyzing head movements -- specifically speed,\nacceleration, and angular displacement -- during video-recorded interviews with\npatients suffering from severe depression. Using data from a new CALYPSO\nDepression Dataset, we extracted head motion characteristics and applied\nregression analysis to predict clinically evaluated anxiety levels. Our results\ndemonstrate a high level of precision, achieving a mean absolute error (MAE) of\n0.35 in predicting the severity of psychological anxiety based on head movement\npatterns. 
This indicates that our approach can enhance the understanding of\nanxiety's role in depression and assist psychiatrists in refining treatment\nstrategies for individuals.\n","authors":["Fouad Boutaleb","Emery Pierson","Nicolas Doudeau","Clémence Nineuil","Ali Amad","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2502.08813v2.pdf","comment":"19th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG), 2025"},{"id":"http://arxiv.org/abs/2411.01099v2","updated":"2025-03-02T05:33:33Z","published":"2024-11-02T01:31:47Z","title":"Few-Class Arena: A Benchmark for Efficient Selection of Vision Models\n and Dataset Difficulty Measurement","summary":" We propose Few-Class Arena (FCA), as a unified benchmark with focus on\ntesting efficient image classification models for few classes. A wide variety\nof benchmark datasets with many classes (80-1000) have been created to assist\nComputer Vision architectural evolution. An increasing number of vision models\nare evaluated with these many-class datasets. However, real-world applications\noften involve substantially fewer classes of interest (2-10). This gap between\nmany and few classes makes it difficult to predict performance of the few-class\napplications using models trained on the available many-class datasets. To\ndate, little has been offered to evaluate models in this Few-Class Regime. We\nconduct a systematic evaluation of the ResNet family trained on ImageNet\nsubsets from 2 to 1000 classes, and test a wide spectrum of Convolutional\nNeural Networks and Transformer architectures over ten datasets by using our\nnewly proposed FCA tool. Furthermore, to aid an up-front assessment of dataset\ndifficulty and a more efficient selection of models, we incorporate a\ndifficulty measure as a function of class similarity. 
FCA offers a new tool for\nefficient machine learning in the Few-Class Regime, with goals ranging from a\nnew efficient class similarity proposal, to lightweight model architecture\ndesign, to a new scaling law. FCA is user-friendly and can be easily extended\nto new models and datasets, facilitating future research work. Our benchmark is\navailable at https://github.com/bryanbocao/fca.\n","authors":["Bryan Bo Cao","Lawrence O'Gorman","Michael Coss","Shubham Jain"],"pdf_url":"https://arxiv.org/pdf/2411.01099v2.pdf","comment":"10 pages, 32 pages including References and Appendix, 19 figures, 8\n tables"},{"id":"http://arxiv.org/abs/2410.03816v2","updated":"2025-03-02T05:29:42Z","published":"2024-10-04T18:47:49Z","title":"Modeling and Analysis of Spatial and Temporal Land Clutter Statistics in\n SAR Imaging Based on MSTAR Data","summary":" The statistical analysis of land clutter for Synthetic Aperture Radar (SAR)\nimaging has become an increasingly important subject for research and\ninvestigation. It is also absolutely necessary for designing robust algorithms\ncapable of performing the task of target detection in the background clutter.\nAny attempt to extract the energy of the desired targets from the land clutter\nrequires complete knowledge of the statistical properties of the background\nclutter. In this paper, the spatial as well as the temporal characteristics of\nthe land clutter are studied. Since the data for each image has been collected\nbased on a different aspect angle; therefore, the temporal analysis contains\nvariation in the aspect angle. Consequently, the temporal analysis includes the\ncharacteristics of the radar cross section with respect to the aspect angle\nbased on which the data has been collected. In order to perform the statistical\nanalysis, several well-known and relevant distributions, namely, Weibull,\nLog-normal, Gamma, and Rayleigh are considered as prime candidates to model the\nland clutter. 
The goodness-of-fit test is based on the Kullback-Leibler (KL)\nDivergence metric. The detailed analysis presented in this paper demonstrates\nthat the Weibull distribution is a more accurate fit for the\ntemporal-aspect-angle statistical analysis while the Rayleigh distribution\nmodels the spatial characteristics of the background clutter with higher\naccuracy. Finally, based on the aforementioned statistical analyses and by\nutilizing the Constant False Alarm Rate (CFAR) algorithm, we perform target\ndetection in land clutter. The overall verification of the analysis is\nperformed by exploiting the Moving and Stationary Target Acquisition and\nRecognition (MSTAR) data-set, which has been collected in spotlight mode at\nX-band, and the results are presented.\n","authors":["Shahrokh Hamidi"],"pdf_url":"https://arxiv.org/pdf/2410.03816v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2409.02155"},{"id":"http://arxiv.org/abs/2408.08258v3","updated":"2025-03-02T04:25:12Z","published":"2024-08-15T16:59:15Z","title":"Snuffy: Efficient Whole Slide Image Classifier","summary":" Whole Slide Image (WSI) classification with multiple instance learning (MIL)\nin digital pathology faces significant computational challenges. Current\nmethods mostly rely on extensive self-supervised learning (SSL) for\nsatisfactory performance, requiring long training periods and considerable\ncomputational resources. At the same time, no pre-training affects performance\ndue to domain shifts from natural images to WSIs. We introduce Snuffy\narchitecture, a novel MIL-pooling method based on sparse transformers that\nmitigates performance loss with limited pre-training and enables continual\nfew-shot pre-training as a competitive option. Our sparsity pattern is tailored\nfor pathology and is theoretically proven to be a universal approximator with\nthe tightest probabilistic sharp bound on the number of layers for sparse\ntransformers, to date. 
We demonstrate Snuffy's effectiveness on CAMELYON16 and\nTCGA Lung cancer datasets, achieving superior WSI and patch-level accuracies.\nThe code is available on https://github.com/jafarinia/snuffy.\n","authors":["Hossein Jafarinia","Alireza Alipanah","Danial Hamdi","Saeed Razavi","Nahal Mirzaie","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.08258v3.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2405.16071v2","updated":"2025-03-02T04:18:55Z","published":"2024-05-25T05:44:55Z","title":"DynRefer: Delving into Region-level Multimodal Tasks via Dynamic\n Resolution","summary":" One fundamental task of multimodal models is to translate referred image\nregions to human preferred language descriptions. Existing methods, however,\nignore the resolution adaptability needs of different tasks, which hinders them\nto find out precise language descriptions. In this study, we propose a DynRefer\napproach, to pursue high-accuracy region-level referring through mimicking the\nresolution adaptability of human visual cognition. During training, DynRefer\nstochastically aligns language descriptions of multimodal tasks with images of\nmultiple resolutions, which are constructed by nesting a set of random views\naround the referred region. During inference, DynRefer performs selectively\nmultimodal referring by sampling proper region representations for tasks from\nthe nested views based on image and task priors. This allows the visual\ninformation for referring to better match human preferences, thereby improving\nthe representational adaptability of region-level multimodal models.\nExperiments show that DynRefer brings mutual improvement upon broad tasks\nincluding region-level captioning, open-vocabulary region recognition and\nattribute detection. Furthermore, DynRefer achieves state-of-the-art results on\nmultiple region-level multimodal tasks using a single model. 
Code is available\nat https://github.com/callsys/DynRefer.\n","authors":["Yuzhong Zhao","Feng Liu","Yue Liu","Mingxiang Liao","Chen Gong","Qixiang Ye","Fang Wan"],"pdf_url":"https://arxiv.org/pdf/2405.16071v2.pdf","comment":"Accepted in CVPR 2025. Code is available at\n https://github.com/callsys/DynRefer"},{"id":"http://arxiv.org/abs/2411.18018v2","updated":"2025-03-02T04:05:24Z","published":"2024-11-27T03:21:57Z","title":"Neural Finite-State Machines for Surgical Phase Recognition","summary":" Surgical phase recognition (SPR) is crucial for applications in workflow\noptimization, performance evaluation, and real-time intervention guidance.\nHowever, current deep learning models often struggle with fragmented\npredictions, failing to capture the sequential nature of surgical workflows. We\npropose the Neural Finite-State Machine (NFSM), a novel approach that enforces\ntemporal coherence by integrating classical state-transition priors with modern\nneural networks. NFSM leverages learnable global state embeddings as unique\nphase identifiers and dynamic transition tables to model phase-to-phase\nprogressions. Additionally, a future phase forecasting mechanism employs\nrepeated frame padding to anticipate upcoming transitions. Implemented as a\nplug-and-play module, NFSM can be integrated into existing SPR pipelines\nwithout changing their core architectures. We demonstrate state-of-the-art\nperformance across multiple benchmarks, including a significant improvement on\nthe BernBypass70 dataset - raising video-level accuracy by 0.9 points and\nphase-level precision, recall, F1-score, and mAP by 3.8, 3.1, 3.3, and 4.1,\nrespectively. Ablation studies confirm each component's effectiveness and the\nmodule's adaptability to various architectures. 
By unifying finite-state\nprinciples with deep learning, NFSM offers a robust path toward consistent,\nlong-term surgical video analysis.\n","authors":["Hao Ding","Zhongpai Gao","Benjamin Planche","Tianyu Luan","Abhishek Sharma","Meng Zheng","Ange Lou","Terrence Chen","Mathias Unberath","Ziyan Wu"],"pdf_url":"https://arxiv.org/pdf/2411.18018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11069v3","updated":"2025-03-02T03:01:19Z","published":"2025-01-19T15:05:15Z","title":"Refinement Module based on Parse Graph of Feature Map for Human Pose\n Estimation","summary":" Parse graphs of the human body can be obtained in the human brain to help\nhumans complete the human Pose Estimation better (HPE). It contains a\nhierarchical structure, like a tree structure, and context relations among\nnodes. To equip models with such capabilities, many researchers predefine the\nparse graph of body structure to design HPE frameworks. However, these\nframeworks struggle to adapt to instances that deviate from the predefined\nparse graph and are often parameter-heavy. Unlike them, we view the feature map\nholistically, much like the human body. It can be optimized using parse graphs,\nwhere each node's feature is an implicit expression rather than a fixed one.\nThis allows it to adapt to more instances, unconstrained by rigid structural\nfeatures. In this paper, we design the Refinement Module based on the Parse\nGraph of feature map (RMPG), which includes two stages: top-down decomposition\nand bottom-up combination. In the first stage, the feature map is decomposed\ninto multiple sub-feature maps along the channel. In the second stage, the\ncontext relations of sub-feature maps are calculated to obtain their respective\ncontext information and the sub-feature maps with context information are\nconcatenated along channels to obtain the refined feature map. 
Additionally, we\ndesign a hierarchical network with fewer parameters using multiple RMPG modules\nto model the context relations and hierarchies in the parse graph of body\nstructure for HPE, some of which are supervised to obtain context relations\namong body parts. Our network achieves excellent results on multiple mainstream\nhuman pose datasets. More importantly, the effectiveness of RMPG is proven on\ndifferent methods. The code of RMPG will be open.\n","authors":["Shibang Liu","Xuemei Xie","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2501.11069v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20026v2","updated":"2025-03-02T02:45:56Z","published":"2024-10-26T00:49:06Z","title":"Towards Robust Algorithms for Surgical Phase Recognition via Digital\n Twin Representation","summary":" Surgical phase recognition (SPR) is an integral component of surgical data\nscience, enabling high-level surgical analysis. End-to-end trained neural\nnetworks that predict surgical phase directly from videos have shown excellent\nperformance on benchmarks. However, these models struggle with robustness due\nto non-causal associations in the training set. Our goal is to improve model\nrobustness to variations in the surgical videos by leveraging the digital twin\n(DT) paradigm -- an intermediary layer to separate high-level analysis (SPR)\nfrom low-level processing. As a proof of concept, we present a DT\nrepresentation-based framework for SPR from videos. The framework employs\nvision foundation models with reliable low-level scene understanding to craft\nDT representation. We embed the DT representation in place of raw video inputs\nin the state-of-the-art SPR model. 
The framework is trained on the Cholec80\ndataset and evaluated on out-of-distribution (OOD) and corrupted test samples.\nContrary to the vulnerability of the baseline model, our framework demonstrates\nstrong robustness on both OOD and corrupted samples, with a video-level\naccuracy of 80.3 on a highly corrupted Cholec80 test set, 67.9 on the\nchallenging CRCD dataset, and 99.8 on an internal robotic surgery dataset,\noutperforming the baseline by 3.9, 16.8, and 90.9 respectively. We also find\nthat using DT representation as an augmentation to the raw input can\nsignificantly improve model robustness. Our findings lend support to the thesis\nthat DT representations are effective in enhancing model robustness. Future\nwork will seek to improve the feature informativeness and incorporate\ninterpretability for a more comprehensive framework.\n","authors":["Hao Ding","Yuqian Zhang","Wenzheng Cheng","Xinyu Wang","Xu Lian","Chenhao Yu","Hongchao Shu","Ji Woong Kim","Axel Krieger","Mathias Unberath"],"pdf_url":"https://arxiv.org/pdf/2410.20026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04364v4","updated":"2025-03-02T02:32:25Z","published":"2024-01-09T05:32:22Z","title":"SoK: Systematization and Benchmarking of Deepfake Detectors in a Unified\n Framework","summary":" Deepfakes have rapidly emerged as a serious threat to society due to their\nease of creation and dissemination, triggering the accelerated development of\ndetection technologies. However, many existing detectors rely on labgenerated\ndatasets for validation, which may not prepare them for novel, real-world\ndeepfakes. This paper extensively reviews and analyzes state-of-the-art\ndeepfake detectors, evaluating them against several critical criteria. These\ncriteria categorize detectors into 4 high-level groups and 13 finegrained\nsub-groups, aligned with a unified conceptual framework we propose. This\nclassification offers practical insights into the factors affecting detector\nefficacy. 
We evaluate the generalizability of 16 leading detectors across\ncomprehensive attack scenarios, including black-box, white-box, and graybox\nsettings. Our systematized analysis and experiments provide a deeper\nunderstanding of deepfake detectors and their generalizability, paving the way\nfor future research and the development of more proactive defenses against\ndeepfakes.\n","authors":["Binh M. Le","Jiwon Kim","Simon S. Woo","Kristen Moore","Alsharif Abuadbba","Shahroz Tariq"],"pdf_url":"https://arxiv.org/pdf/2401.04364v4.pdf","comment":"20 pages, 6 figures, 7 table, Accepted at IEEE European Symposium on\n security and privacy 2025 (EuroS&P '25)"},{"id":"http://arxiv.org/abs/2410.05470v2","updated":"2025-03-02T02:07:21Z","published":"2024-10-07T20:04:29Z","title":"Image Watermarks are Removable Using Controllable Regeneration from\n Clean Noise","summary":" Image watermark techniques provide an effective way to assert ownership,\ndeter misuse, and trace content sources, which has become increasingly\nessential in the era of large generative models. A critical attribute of\nwatermark techniques is their robustness against various manipulations. In this\npaper, we introduce a watermark removal approach capable of effectively\nnullifying state-of-the-art watermarking techniques. Our primary insight\ninvolves regenerating the watermarked image starting from a clean Gaussian\nnoise via a controllable diffusion model, utilizing the extracted semantic and\nspatial features from the watermarked image. The semantic control adapter and\nthe spatial control network are specifically trained to control the denoising\nprocess towards ensuring image quality and enhancing consistency between the\ncleaned image and the original watermarked image. To achieve a smooth trade-off\nbetween watermark removal performance and image consistency, we further propose\nan adjustable and controllable regeneration scheme. 
This scheme adds varying\nnumbers of noise steps to the latent representation of the watermarked image,\nfollowed by a controlled denoising process starting from this noisy latent\nrepresentation. As the number of noise steps increases, the latent\nrepresentation progressively approaches clean Gaussian noise, facilitating the\ndesired trade-off. We apply our watermark removal methods across various\nwatermarking techniques, and the results demonstrate that our methods offer\nsuperior visual consistency/quality and enhanced watermark removal performance\ncompared to existing regeneration approaches. Our code is available at\nhttps://github.com/yepengliu/CtrlRegen.\n","authors":["Yepeng Liu","Yiren Song","Hai Ci","Yu Zhang","Haofan Wang","Mike Zheng Shou","Yuheng Bu"],"pdf_url":"https://arxiv.org/pdf/2410.05470v2.pdf","comment":"ICLR2025"},{"id":"http://arxiv.org/abs/2502.10603v2","updated":"2025-03-02T01:50:22Z","published":"2025-02-14T23:18:54Z","title":"Adaptive Neural Networks for Intelligent Data-Driven Development","summary":" Advances in machine learning methods for computer vision tasks have led to\ntheir consideration for safety-critical applications like autonomous driving.\nHowever, effectively integrating these methods into the automotive development\nlifecycle remains challenging. Since the performance of machine learning\nalgorithms relies heavily on the training data provided, the data and model\ndevelopment lifecycle play a key role in successfully integrating these\ncomponents into the product development lifecycle. Existing models frequently\nencounter difficulties recognizing or adapting to novel instances not present\nin the original training dataset. This poses a significant risk for reliable\ndeployment in dynamic environments. 
To address this challenge, we propose an\nadaptive neural network architecture and an iterative development framework\nthat enables users to efficiently incorporate previously unknown objects into\nthe current perception system. Our approach builds on continuous learning,\nemphasizing the necessity of dynamic updates to reflect real-world deployment\nconditions. Specifically, we introduce a pipeline with three key components:\n(1) a scalable network extension strategy to integrate new classes while\npreserving existing performance, (2) a dynamic OoD detection component that\nrequires no additional retraining for newly added classes, and (3) a\nretrieval-based data augmentation process tailored for safety-critical\ndeployments. The integration of these components establishes a pragmatic and\nadaptive pipeline for the continuous evolution of perception systems in the\ncontext of autonomous driving.\n","authors":["Youssef Shoeb","Azarm Nowzad","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2502.10603v2.pdf","comment":"8 pages, 3 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2412.05707v3","updated":"2025-03-02T01:46:15Z","published":"2024-12-07T17:40:20Z","title":"Segment-Level Road Obstacle Detection Using Visual Foundation Model\n Priors and Likelihood Ratios","summary":" Detecting road obstacles is essential for autonomous vehicles to navigate\ndynamic and complex traffic environments safely. Current road obstacle\ndetection methods typically assign a score to each pixel and apply a threshold\nto generate final predictions. However, selecting an appropriate threshold is\nchallenging, and the per-pixel classification approach often leads to\nfragmented predictions with numerous false positives. In this work, we propose\na novel method that leverages segment-level features from visual foundation\nmodels and likelihood ratios to predict road obstacles directly. 
By focusing on\nsegments rather than individual pixels, our approach enhances detection\naccuracy, reduces false positives, and offers increased robustness to scene\nvariability. We benchmark our approach against existing methods on the\nRoadObstacle and LostAndFound datasets, achieving state-of-the-art performance\nwithout needing a predefined threshold.\n","authors":["Youssef Shoeb","Nazir Nayal","Azarm Nowzad","Fatma Güney","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2412.05707v3.pdf","comment":"10 pages, 4 figures, and 1 table, to be published in VISAPP 2025"},{"id":"http://arxiv.org/abs/2410.10594v2","updated":"2025-03-02T01:19:51Z","published":"2024-10-14T15:04:18Z","title":"VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality\n Documents","summary":" Retrieval-augmented generation (RAG) is an effective technique that enables\nlarge language models (LLMs) to utilize external knowledge sources for\ngeneration. However, current RAG systems are solely based on text, rendering it\nimpossible to utilize vision information like layout and images that play\ncrucial roles in real-world multi-modality documents. In this paper, we\nintroduce VisRAG, which tackles this issue by establishing a vision-language\nmodel (VLM)-based RAG pipeline. In this pipeline, instead of first parsing the\ndocument to obtain text, the document is directly embedded using a VLM as an\nimage and then retrieved to enhance the generation of a VLM. Compared to\ntraditional text-based RAG, VisRAG maximizes the retention and utilization of\nthe data information in the original documents, eliminating the information\nloss introduced during the parsing process. We collect both open-source and\nsynthetic data to train the retriever in VisRAG and explore a variety of\ngeneration methods. 
Experiments demonstrate that VisRAG outperforms traditional\nRAG in both the retrieval and generation stages, achieving a 20--40% end-to-end\nperformance gain over traditional text-based RAG pipeline. Further analysis\nreveals that VisRAG is efficient in utilizing training data and demonstrates\nstrong generalization capability, positioning it as a promising solution for\nRAG on multi-modality documents. Our code and data are available at\nhttps://github.com/openbmb/visrag.\n","authors":["Shi Yu","Chaoyue Tang","Bokai Xu","Junbo Cui","Junhao Ran","Yukun Yan","Zhenghao Liu","Shuo Wang","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2410.10594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11219v3","updated":"2025-03-02T01:07:41Z","published":"2024-09-17T14:12:50Z","title":"Score Forgetting Distillation: A Swift, Data-Free Method for Machine\n Unlearning in Diffusion Models","summary":" The machine learning community is increasingly recognizing the importance of\nfostering trust and safety in modern generative AI (GenAI) models. We posit\nmachine unlearning (MU) as a crucial foundation for developing safe, secure,\nand trustworthy GenAI models. Traditional MU methods often rely on stringent\nassumptions and require access to real data. This paper introduces Score\nForgetting Distillation (SFD), an innovative MU approach that promotes the\nforgetting of undesirable information in diffusion models by aligning the\nconditional scores of \"unsafe\" classes or concepts with those of \"safe\" ones.\nTo eliminate the need for real data, our SFD framework incorporates a\nscore-based MU loss into the score distillation objective of a pretrained\ndiffusion model. This serves as a regularization term that preserves desired\ngeneration capabilities while enabling the production of synthetic data through\na one-step generator. 
Our experiments on pretrained label-conditional and\ntext-to-image diffusion models demonstrate that our method effectively\naccelerates the forgetting of target classes or concepts during generation,\nwhile preserving the quality of other classes or concepts. This unlearned and\ndistilled diffusion not only pioneers a novel concept in MU but also\naccelerates the generation speed of diffusion models. Our experiments and\nstudies on a range of diffusion models and datasets confirm that our approach\nis generalizable, effective, and advantageous for MU in diffusion models. Code\nis available at https://github.com/tqch/score-forgetting-distillation.\n($\\textbf{Warning:}$ This paper contains sexually explicit imagery, discussions\nof pornography, racially-charged terminology, and other content that some\nreaders may find disturbing, distressing, and/or offensive.)\n","authors":["Tianqi Chen","Shujian Zhang","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.11219v3.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2502.02283v3","updated":"2025-03-02T00:25:45Z","published":"2025-02-04T12:50:16Z","title":"GP-GS: Gaussian Processes for Enhanced Gaussian Splatting","summary":" 3D Gaussian Splatting has emerged as an efficient photorealistic novel view\nsynthesis method. However, its reliance on sparse Structure-from-Motion (SfM)\npoint clouds consistently compromises the scene reconstruction quality. To\naddress these limitations, this paper proposes a novel 3D reconstruction\nframework Gaussian Processes Gaussian Splatting (GP-GS), where a multi-output\nGaussian Process model is developed to achieve adaptive and uncertainty-guided\ndensification of sparse SfM point clouds. Specifically, we propose a dynamic\nsampling and filtering pipeline that adaptively expands the SfM point clouds by\nleveraging GP-based predictions to infer new candidate points from the input 2D\npixels and depth maps. 
The pipeline utilizes uncertainty estimates to guide the\npruning of high-variance predictions, ensuring geometric consistency and\nenabling the generation of dense point clouds. The densified point clouds\nprovide high-quality initial 3D Gaussians to enhance reconstruction\nperformance. Extensive experiments conducted on synthetic and real-world\ndatasets across various scales validate the effectiveness and practicality of\nthe proposed framework.\n","authors":["Zhihao Guo","Jingxuan Su","Shenglin Wang","Jinlong Fan","Jing Zhang","Liangxiu Han","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2502.02283v3.pdf","comment":"14 pages,11 figures"},{"id":"http://arxiv.org/abs/2411.18810v4","updated":"2025-03-02T00:15:11Z","published":"2024-11-27T23:32:54Z","title":"All Seeds Are Not Equal: Enhancing Compositional Text-to-Image\n Generation with Reliable Random Seeds","summary":" Text-to-image diffusion models have demonstrated remarkable capability in\ngenerating realistic images from arbitrary text prompts. However, they often\nproduce inconsistent results for compositional prompts such as \"two dogs\" or \"a\npenguin on the right of a bowl\". Understanding these inconsistencies is crucial\nfor reliable image generation. In this paper, we highlight the significant role\nof initial noise in these inconsistencies, where certain noise patterns are\nmore reliable for compositional prompts than others. Our analyses reveal that\ndifferent initial random seeds tend to guide the model to place objects in\ndistinct image areas, potentially adhering to specific patterns of camera\nangles and image composition associated with the seed. To improve the model's\ncompositional ability, we propose a method for mining these reliable cases,\nresulting in a curated training set of generated images without requiring any\nmanual annotation. By fine-tuning text-to-image models on these generated\nimages, we significantly enhance their compositional capabilities. 
For\nnumerical composition, we observe relative increases of 29.3% and 19.5% for\nStable Diffusion and PixArt-{\\alpha}, respectively. Spatial composition sees\neven larger gains, with 60.7% for Stable Diffusion and 21.1% for\nPixArt-{\\alpha}.\n","authors":["Shuangqi Li","Hieu Le","Jingyi Xu","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2411.18810v4.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.11856v2","updated":"2025-03-02T23:43:15Z","published":"2024-11-01T17:33:28Z","title":"Automatically Improving LLM-based Verilog Generation using EDA Tool\n Feedback","summary":" Traditionally, digital hardware designs are written in the Verilog hardware\ndescription language (HDL) and debugged manually by engineers. This can be\ntime-consuming and error-prone for complex designs. Large Language Models\n(LLMs) are emerging as a potential tool to help generate fully functioning HDL\ncode, but most works have focused on generation in the single-shot capacity:\ni.e., run and evaluate, a process that does not leverage debugging and, as\nsuch, does not adequately reflect a realistic development process. In this\nwork, we evaluate the ability of LLMs to leverage feedback from electronic\ndesign automation (EDA) tools to fix mistakes in their own generated Verilog.\nTo accomplish this, we present an open-source, highly customizable framework,\nAutoChip, which combines conversational LLMs with the output from Verilog\ncompilers and simulations to iteratively generate and repair Verilog. To\ndetermine the success of these LLMs we leverage the VerilogEval benchmark set.\nWe evaluate four state-of-the-art conversational LLMs, focusing on readily\naccessible commercial models. EDA tool feedback proved to be consistently more\neffective than zero-shot prompting only with GPT-4o, the most computationally\ncomplex model we evaluated. 
In the best case, we observed a 5.8% increase in\nthe number of successful designs with a 34.2% decrease in cost over the best\nzero-shot results. Mixing smaller models with this larger model at the end of\nthe feedback iterations resulted in equally as much success as with GPT-4o\nusing feedback, but incurred 41.9% lower cost (corresponding to an overall\ndecrease in cost over zero-shot by 89.6%).\n","authors":["Jason Blocklove","Shailja Thakur","Benjamin Tan","Hammond Pearce","Siddharth Garg","Ramesh Karri"],"pdf_url":"https://arxiv.org/pdf/2411.11856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15998v2","updated":"2025-03-02T23:41:37Z","published":"2024-08-28T17:59:31Z","title":"Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of\n Encoders","summary":" The ability to accurately interpret complex visual information is a crucial\ntopic of multimodal large language models (MLLMs). Recent work indicates that\nenhanced visual perception significantly reduces hallucinations and improves\nperformance on resolution-sensitive tasks, such as optical character\nrecognition and document analysis. A number of recent MLLMs achieve this goal\nusing a mixture of vision encoders. Despite their success, there is a lack of\nsystematic comparisons and detailed ablation studies addressing critical\naspects, such as expert selection and the integration of multiple vision\nexperts. This study provides an extensive exploration of the design space for\nMLLMs using a mixture of vision encoders and resolutions. Our findings reveal\nseveral underlying principles common to various existing strategies, leading to\na streamlined yet effective design approach. We discover that simply\nconcatenating visual tokens from a set of complementary vision encoders is as\neffective as more complex mixing architectures or strategies. We additionally\nintroduce Pre-Alignment to bridge the gap between vision-focused encoders and\nlanguage tokens, enhancing model coherence. 
The resulting family of MLLMs,\nEagle, surpasses other leading open-source models on major MLLM benchmarks.\n","authors":["Min Shi","Fuxiao Liu","Shihao Wang","Shijia Liao","Subhashree Radhakrishnan","Yilin Zhao","De-An Huang","Hongxu Yin","Karan Sapra","Yaser Yacoob","Humphrey Shi","Bryan Catanzaro","Andrew Tao","Jan Kautz","Zhiding Yu","Guilin Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15998v2.pdf","comment":"Github: https://github.com/NVlabs/Eagle, HuggingFace:\n https://huggingface.co/NVEagle"},{"id":"http://arxiv.org/abs/2406.04046v3","updated":"2025-03-02T23:24:43Z","published":"2024-06-06T13:15:37Z","title":"ActionReasoningBench: Reasoning about Actions with and without\n Ramification Constraints","summary":" Reasoning about Actions and Change (RAC) has historically played a pivotal\nrole in solving foundational AI problems, such as the frame problem. It has\ndriven advancements in AI fields, such as non-monotonic and commonsense\nreasoning. RAC remains crucial for AI systems that operate in dynamic\nenvironments, engage in interactive scenarios, or rely on commonsense\nreasoning. Despite substantial advances made by Large Language Models (LLMs) in\nvarious AI domains, their performance in RAC remains underexplored. To address\nthis gap, we introduce a new diagnostic benchmark, ActionReasoningBench, which\nencompasses 8 domains and includes questions for up to 19 action sequences.\nThis benchmark rigorously evaluates LLMs across six key RAC dimensions: Fluent\nTracking, State Tracking, Action Executability, Effects of Actions, Numerical\nRAC, and Composite Questions. LLMs demonstrate average accuracy rates of\n73.55%, 65.63%, 58.73%, and 62.38% on the former four dimensions, which are\nfrequently discussed in RAC literature. However, the performance on the latter\ntwo dimensions, which introduce complex and novel reasoning questions, the\naverage performance of LLMs is lowered to 33.16% and 51.19%, respectively,\nreflecting a 17.9% performance decline. 
We also introduce new ramification\nconstraints to capture the indirect effects of actions, providing deeper\ninsights into RAC challenges. Our evaluation of state-of-the-art LLMs,\nincluding both open-source and commercial models, reveals challenges across all\nRAC dimensions, particularly in handling ramifications, with GPT-4o failing to\nsolve any question and o1-preview achieving a score of only 18.4%.\n","authors":["Divij Handa","Pavel Dolin","Shrinidhi Kumbhar","Tran Cao Son","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2406.04046v3.pdf","comment":"Accepted in ICLR 2025"},{"id":"http://arxiv.org/abs/2407.11249v3","updated":"2025-03-02T22:12:01Z","published":"2024-07-15T21:32:58Z","title":"Disentangling Representations through Multi-task Learning","summary":" Intelligent perception and interaction with the world hinges on internal\nrepresentations that capture its underlying structure (''disentangled'' or\n''abstract'' representations). Disentangled representations serve as world\nmodels, isolating latent factors of variation in the world along approximately\northogonal directions, thus facilitating feature-based generalization. We\nprovide experimental and theoretical results guaranteeing the emergence of\ndisentangled representations in agents that optimally solve multi-task evidence\naccumulation classification tasks, canonical in the neuroscience literature.\nThe key conceptual finding is that, by producing accurate multi-task\nclassification estimates, a system implicitly represents a set of coordinates\nspecifying a disentangled representation of the underlying latent state of the\ndata it receives. The theory provides conditions for the emergence of these\nrepresentations in terms of noise, number of tasks, and evidence accumulation\ntime. 
We experimentally validate these predictions in RNNs trained to\nmulti-task, which learn disentangled representations in the form of continuous\nattractors, leading to zero-shot out-of-distribution (OOD) generalization in\npredicting latent factors. We demonstrate the robustness of our framework\nacross autoregressive architectures, decision boundary geometries and in tasks\nrequiring classification confidence estimation. We find that transformers are\nparticularly suited for disentangling representations, which might explain\ntheir unique world understanding abilities. Overall, our framework establishes\na formal link between competence at multiple tasks and the formation of\ndisentangled, interpretable world models in both biological and artificial\nsystems, and helps explain why ANNs often arrive at human-interpretable\nconcepts, and how they both may acquire exceptional zero-shot generalization\ncapabilities.\n","authors":["Pantelis Vafidis","Aman Bhargava","Antonio Rangel"],"pdf_url":"https://arxiv.org/pdf/2407.11249v3.pdf","comment":"43 pages, 17 figures"},{"id":"http://arxiv.org/abs/2406.10279v3","updated":"2025-03-02T21:03:52Z","published":"2024-06-12T03:29:06Z","title":"We Have a Package for You! A Comprehensive Analysis of Package\n Hallucinations by Code Generating LLMs","summary":" The reliance of popular programming languages such as Python and JavaScript\non centralized package repositories and open-source software, combined with the\nemergence of code-generating Large Language Models (LLMs), has created a new\ntype of threat to the software supply chain: package hallucinations. These\nhallucinations, which arise from fact-conflicting errors when generating code\nusing LLMs, represent a novel form of package confusion attack that poses a\ncritical threat to the integrity of the software supply chain. 
This paper\nconducts a rigorous and comprehensive evaluation of package hallucinations\nacross different programming languages, settings, and parameters, exploring how\na diverse set of models and configurations affect the likelihood of generating\nerroneous package recommendations and identifying the root causes of this\nphenomenon. Using 16 popular LLMs for code generation and two unique prompt\ndatasets, we generate 576,000 code samples in two programming languages that we\nanalyze for package hallucinations. Our findings reveal that that the average\npercentage of hallucinated packages is at least 5.2% for commercial models and\n21.7% for open-source models, including a staggering 205,474 unique examples of\nhallucinated package names, further underscoring the severity and pervasiveness\nof this threat. To overcome this problem, we implement several hallucination\nmitigation strategies and show that they are able to significantly reduce the\nnumber of package hallucinations while maintaining code quality. Our\nexperiments and findings highlight package hallucinations as a persistent and\nsystemic phenomenon while using state-of-the-art LLMs for code generation, and\na significant challenge which deserves the research community's urgent\nattention.\n","authors":["Joseph Spracklen","Raveen Wijewickrama","A H M Nazmus Sakib","Anindya Maiti","Bimal Viswanath","Murtuza Jadliwala"],"pdf_url":"https://arxiv.org/pdf/2406.10279v3.pdf","comment":"To appear in the 2025 USENIX Security Symposium. 22 pages, 14\n figures, 8 tables. Edited from original version for submission to a different\n conference. No change to original results or findings"},{"id":"http://arxiv.org/abs/2208.07143v4","updated":"2025-03-02T20:57:50Z","published":"2022-08-15T12:33:29Z","title":"C-Causal Blindness","summary":" This text is concerned with a hypothetical flavour of cognitive blindness\nreferred to in this paper as \\textit{C-Causal Blindness} or C-CB. 
A cognitive\nblindness where the policy to obtain the objective leads to the state to be\navoided. A literal example of C-CB would be \\textit{Kurt G\\\"odel's} decision to\nstarve for \\textit{\"fear of being poisoned\"} - take this to be premise\n\\textbf{A}. The objective being \\textit{\"to avoid being poisoned (so as to not\ndie)\"}: \\textbf{C}, the plan or policy being \\textit{\"don't eat\"}: \\textbf{B},\nand the actual outcome having been \\textit{\"dying\"}: $\\lnot$\\textbf{C} - the\nstate that G\\\"odel wanted to avoid to begin with. G\\\"odel pursued a strategy\nthat caused the result he wanted to avoid. An experimental computational\nframework is proposed to show the isomorphic relationship between C-CB in brain\ncomputations, logic, and computer computations using a new proposed algorithm:\na Weighted Hidden Markov Model.\n","authors":["Gonçalo Hora de Carvalho"],"pdf_url":"https://arxiv.org/pdf/2208.07143v4.pdf","comment":"restructuring"},{"id":"http://arxiv.org/abs/2410.06232v3","updated":"2025-03-02T20:40:21Z","published":"2024-10-08T17:41:37Z","title":"Range, not Independence, Drives Modularity in Biologically Inspired\n Representations","summary":" Why do biological and artificial neurons sometimes modularise, each encoding\na single meaningful variable, and sometimes entangle their representation of\nmany variables? In this work, we develop a theory of when biologically inspired\nnetworks -- those that are nonnegative and energy efficient -- modularise their\nrepresentation of source variables (sources). We derive necessary and\nsufficient conditions on a sample of sources that determine whether the neurons\nin an optimal biologically-inspired linear autoencoder modularise. Our theory\napplies to any dataset, extending far beyond the case of statistical\nindependence studied in previous work. Rather we show that sources modularise\nif their support is ``sufficiently spread''. 
From this theory, we extract and\nvalidate predictions in a variety of empirical studies on how data distribution\naffects modularisation in nonlinear feedforward and recurrent neural networks\ntrained on supervised and unsupervised tasks. Furthermore, we apply these ideas\nto neuroscience data, showing that range independence can be used to understand\nthe mixing or modularising of spatial and reward information in entorhinal\nrecordings in seemingly conflicting experiments. Further, we use these results\nto suggest alternate origins of mixed-selectivity, beyond the predominant\ntheory of flexible nonlinear classification. In sum, our theory prescribes\nprecise conditions on when neural activities modularise, providing tools for\ninducing and elucidating modular representations in brains and machines.\n","authors":["Will Dorrell","Kyle Hsu","Luke Hollingsworth","Jin Hwa Lee","Jiajun Wu","Chelsea Finn","Peter E Latham","Tim EJ Behrens","James CR Whittington"],"pdf_url":"https://arxiv.org/pdf/2410.06232v3.pdf","comment":"47 pages, 17 figures. WD and KH contributed equally; LH and JHL\n contributed equally"},{"id":"http://arxiv.org/abs/2402.10767v2","updated":"2025-03-02T20:33:20Z","published":"2024-02-16T15:41:23Z","title":"Inference to the Best Explanation in Large Language Models","summary":" While Large Language Models (LLMs) have found success in real-world\napplications, their underlying explanatory process is still poorly understood.\nThis paper proposes IBE-Eval, a framework inspired by philosophical accounts on\nInference to the Best Explanation (IBE) to advance the interpretation and\nevaluation of LLMs' explanations. IBE-Eval estimates the plausibility of\nnatural language explanations through a combination of explicit logical and\nlinguistic features including: consistency, parsimony, coherence, and\nuncertainty. 
Extensive experiments are conducted on Causal Question Answering\n(CQA), where \\textit{IBE-Eval} is tasked to select the most plausible causal\nexplanation amongst competing ones generated by LLMs (i.e., GPT 3.5 and Llama\n2). The experiments reveal that IBE-Eval can successfully identify the best\nexplanation with up to 77\\% accuracy ($\\approx 27\\%$ above random), improving\nupon a GPT 3.5-as-a-Judge baseline ($\\approx+17\\%$) while being intrinsically\nmore efficient and interpretable. Additional analyses suggest that, despite\nmodel-specific variances, LLM-generated explanations tend to conform to IBE\ncriteria and that IBE-Eval is significantly correlated with human judgment,\nopening up opportunities for future development of automated explanation\nverification tools.\n","authors":["Dhairya Dalal","Marco Valentino","André Freitas","Paul Buitelaar"],"pdf_url":"https://arxiv.org/pdf/2402.10767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12534v2","updated":"2025-03-02T20:13:11Z","published":"2024-04-18T22:54:08Z","title":"Lean Copilot: Large Language Models as Copilots for Theorem Proving in\n Lean","summary":" Neural theorem proving combines large language models (LLMs) with proof\nassistants such as Lean, where the correctness of formal proofs can be\nrigorously verified, leaving no room for hallucination. With existing neural\ntheorem provers pretrained on a fixed collection of data and offering valuable\nsuggestions at times, it is challenging for them to continually prove novel\ntheorems in a fully autonomous mode, where human insights may be critical. In\nthis paper, we explore LLMs as copilots that assist humans in proving theorems.\nWe introduce Lean Copilot, an general framework for running LLM inference\nnatively in Lean. 
It enables programmers to build various LLM-based proof\nautomation tools that integrate seamlessly into the workflow of Lean users.\nLean users can use our pretrained models or bring their own ones that run\neither locally (with or without GPUs) or on the cloud. Using Lean Copilot, we\nbuild LLM-based tools that suggest proof steps, complete proof goals, and\nselect relevant premises. Experimental results on the Mathematics in Lean\ntextbook demonstrate the effectiveness of our method compared to existing\nrule-based proof automation in Lean (aesop). When assisting humans, Lean\nCopilot requires only 2.08 manually-entered proof steps on average (3.86\nrequired by aesop); when automating the theorem proving process, Lean Copilot\nautomates 74.2% proof steps on average, 85% better than aesop (40.1%). We open\nsource all code and artifacts under a permissive MIT license to facilitate\nfurther research.\n","authors":["Peiyang Song","Kaiyu Yang","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2404.12534v2.pdf","comment":"All code and artifacts open-sourced at\n https://github.com/lean-dojo/LeanCopilot"},{"id":"http://arxiv.org/abs/2410.20285v5","updated":"2025-03-02T19:42:45Z","published":"2024-10-26T22:45:56Z","title":"SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and\n Iterative Refinement","summary":" Software engineers operating in complex and dynamic environments must\ncontinuously adapt to evolving requirements, learn iteratively from experience,\nand reconsider their approaches based on new insights. However, current large\nlanguage model (LLM)-based software agents often follow linear, sequential\nprocesses that prevent backtracking and exploration of alternative solutions,\nlimiting their ability to rethink their strategies when initial approaches\nprove ineffective. 
To address these challenges, we propose SWE-Search, a\nmulti-agent framework that integrates Monte Carlo Tree Search (MCTS) with a\nself-improvement mechanism to enhance software agents' performance on\nrepository-level software tasks. SWE-Search extends traditional MCTS by\nincorporating a hybrid value function that leverages LLMs for both numerical\nvalue estimation and qualitative evaluation. This enables self-feedback loops\nwhere agents iteratively refine their strategies based on both quantitative\nnumerical evaluations and qualitative natural language assessments of pursued\ntrajectories. The framework includes a SWE-Agent for adaptive exploration, a\nValue Agent for iterative feedback, and a Discriminator Agent that facilitates\nmulti-agent debate for collaborative decision-making. Applied to the SWE-bench\nbenchmark, our approach demonstrates a 23% relative improvement in performance\nacross five models compared to standard open-source agents without MCTS. Our\nanalysis reveals how performance scales with increased inference-time compute\nthrough deeper search, providing a pathway to improve software agents without\nrequiring larger models or additional training data. This highlights the\npotential of self-evaluation driven search techniques in complex software\nengineering environments.\n","authors":["Antonis Antoniades","Albert Örwall","Kexun Zhang","Yuxi Xie","Anirudh Goyal","William Wang"],"pdf_url":"https://arxiv.org/pdf/2410.20285v5.pdf","comment":"Main body: 10 pages, 5 figures. 
Appendix: 5 pages, 4 figures.\n Open-source codebase"},{"id":"http://arxiv.org/abs/2405.14105v4","updated":"2025-03-02T18:24:29Z","published":"2024-05-23T02:14:17Z","title":"Distributed Speculative Inference (DSI): Speculation Parallelism for\n Provably Faster Lossless Language Model Inference","summary":" This paper introduces distributed speculative inference (DSI), a novel\ninference algorithm that is provably faster than speculative inference (SI)\n[leviathan2023, chen2023, miao2024, sun2025, timor2025] and standard\nautoregressive inference (non-SI). Like other SI algorithms, DSI operates on\nfrozen language models (LMs), requiring no training or architectural\nmodifications, and it preserves the target distribution. Prior studies on SI\nhave demonstrated empirical speedups over non-SI--but rely on sufficiently fast\nand accurate drafters, which are often unavailable in practice. We identify a\ngap where SI can be slower than non-SI if drafters are too slow or inaccurate.\nWe close this gap by proving that DSI is faster than both SI and non-SI--given\nany drafters. DSI is therefore not only faster than SI, but also unlocks the\nacceleration of LMs for which SI fails. DSI leverages speculation parallelism\n(SP), a novel type of task parallelism, to orchestrate target and drafter\ninstances that overlap in time, establishing a new foundational tradeoff\nbetween computational resources and latency. Our simulations show that DSI is\n1.29-1.92x faster than SI in single-node setups for various off-the-shelf LMs\nand tasks. We open-source all our code.\n","authors":["Nadav Timor","Jonathan Mamou","Daniel Korat","Moshe Berchansky","Oren Pereg","Moshe Wasserblat","Tomer Galanti","Michal Gordon","David Harel"],"pdf_url":"https://arxiv.org/pdf/2405.14105v4.pdf","comment":"Published at ICLR 2025. 
(Link:\n https://openreview.net/forum?id=cJd1BgZ9CS)"},{"id":"http://arxiv.org/abs/2407.13929v2","updated":"2025-03-02T18:17:11Z","published":"2024-07-18T22:33:52Z","title":"Unmasking Social Bots: How Confident Are We?","summary":" Social bots remain a major vector for spreading disinformation on social\nmedia and a menace to the public. Despite the progress made in developing\nmultiple sophisticated social bot detection algorithms and tools, bot detection\nremains a challenging, unsolved problem that is fraught with uncertainty due to\nthe heterogeneity of bot behaviors, training data, and detection algorithms.\nDetection models often disagree on whether to label the same account as bot or\nhuman-controlled. However, they do not provide any measure of uncertainty to\nindicate how much we should trust their results. We propose to address both bot\ndetection and the quantification of uncertainty at the account level - a novel\nfeature of this research. This dual focus is crucial as it allows us to\nleverage additional information related to the quantified uncertainty of each\nprediction, thereby enhancing decision-making and improving the reliability of\nbot classifications. Specifically, our approach facilitates targeted\ninterventions for bots when predictions are made with high confidence and\nsuggests caution (e.g., gathering more data) when predictions are uncertain.\n","authors":["James Giroux","Ariyarathne Gangani","Alexander C. Nwala","Cristiano Fanelli"],"pdf_url":"https://arxiv.org/pdf/2407.13929v2.pdf","comment":"15 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.08531v2","updated":"2025-03-02T18:15:48Z","published":"2024-08-16T04:57:54Z","title":"Detecting Unsuccessful Students in Cybersecurity Exercises in Two\n Different Learning Environments","summary":" This full paper in the research track evaluates the usage of data logged from\ncybersecurity exercises in order to predict students who are potentially at\nrisk of performing poorly. 
Hands-on exercises are essential for learning since\nthey enable students to practice their skills. In cybersecurity, hands-on\nexercises are often complex and require knowledge of many topics. Therefore,\nstudents may miss solutions due to gaps in their knowledge and become\nfrustrated, which impedes their learning. Targeted aid by the instructor helps,\nbut since the instructor's time is limited, efficient ways to detect struggling\nstudents are needed. This paper develops automated tools to predict when a\nstudent is having difficulty. We formed a dataset with the actions of 313\nstudents from two countries and two learning environments: KYPO CRP and\nEDURange. These data are used in machine learning algorithms to predict the\nsuccess of students in exercises deployed in these environments. After\nextracting features from the data, we trained and cross-validated eight\nclassifiers for predicting the exercise outcome and evaluated their predictive\npower. The contribution of this paper is comparing two approaches to feature\nengineering, modeling, and classification performance on data from two learning\nenvironments. Using the features from either learning environment, we were able\nto detect and distinguish between successful and struggling students. A\ndecision tree classifier achieved the highest balanced accuracy and sensitivity\nwith data from both learning environments. The results show that activity data\nfrom cybersecurity exercises are suitable for predicting student success. In a\npotential application, such models can aid instructors in detecting struggling\nstudents and providing targeted help. We publish data and code for building\nthese models so that others can adopt or adapt them.\n","authors":["Valdemar Švábenský","Kristián Tkáčik","Aubrey Birdwell","Richard Weiss","Ryan S. 
Baker","Pavel Čeleda","Jan Vykopal","Jens Mache","Ankur Chattopadhyay"],"pdf_url":"https://arxiv.org/pdf/2408.08531v2.pdf","comment":"Published in the FIE 2024 conference proceedings, see\n https://doi.org/10.1109/FIE61694.2024.10893135"},{"id":"http://arxiv.org/abs/2410.11112v5","updated":"2025-03-02T17:48:06Z","published":"2024-10-14T21:43:48Z","title":"Differentiable Weightless Neural Networks","summary":" We introduce the Differentiable Weightless Neural Network (DWN), a model\nbased on interconnected lookup tables. Training of DWNs is enabled by a novel\nExtended Finite Difference technique for approximate differentiation of binary\nvalues. We propose Learnable Mapping, Learnable Reduction, and Spectral\nRegularization to further improve the accuracy and efficiency of these models.\nWe evaluate DWNs in three edge computing contexts: (1) an FPGA-based hardware\naccelerator, where they demonstrate superior latency, throughput, energy\nefficiency, and model area compared to state-of-the-art solutions, (2) a\nlow-power microcontroller, where they achieve preferable accuracy to XGBoost\nwhile subject to stringent memory constraints, and (3) ultra-low-cost chips,\nwhere they consistently outperform small models in both accuracy and projected\nhardware area. DWNs also compare favorably against leading approaches for\ntabular datasets, with higher average rank. Overall, our work positions DWNs as\na pioneering solution for edge-compatible high-throughput neural networks.\n","authors":["Alan T. L. Bacellar","Zachary Susskind","Mauricio Breternitz Jr.","Eugene John","Lizy K. John","Priscila M. V. Lima","Felipe M. G. 
França"],"pdf_url":"https://arxiv.org/pdf/2410.11112v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08743v2","updated":"2025-03-02T17:33:03Z","published":"2024-03-13T17:46:28Z","title":"Prompting Fairness: Integrating Causality to Debias Large Language\n Models","summary":" Large language models (LLMs), despite their remarkable capabilities, are\nsusceptible to generating biased and discriminatory responses. As LLMs\nincreasingly influence high-stakes decision-making (e.g., hiring and\nhealthcare), mitigating these biases becomes critical. In this work, we propose\na causality-guided debiasing framework to tackle social biases, aiming to\nreduce the objectionable dependence between LLMs' decisions and the social\ninformation in the input. Our framework introduces a novel perspective to\nidentify how social information can affect an LLM's decision through different\ncausal pathways. Leveraging these causal insights, we outline principled\nprompting strategies that regulate these pathways through selection mechanisms.\nThis framework not only unifies existing prompting-based debiasing techniques,\nbut also opens up new directions for reducing bias by encouraging the model to\nprioritize fact-based reasoning over reliance on biased social cues. 
We\nvalidate our framework through extensive experiments on real-world datasets\nacross multiple domains, demonstrating its effectiveness in debiasing LLM\ndecisions, even with only black-box access to the model.\n","authors":["Jingling Li","Zeyu Tang","Xiaoyu Liu","Peter Spirtes","Kun Zhang","Liu Leqi","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.08743v2.pdf","comment":"24 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.14876v3","updated":"2025-03-02T17:27:04Z","published":"2024-09-23T10:17:13Z","title":"Tri-Clustering: A Multi-views Tri-level Information Fusion Context\n Clustering Framework for Localization and Classification in Mammography","summary":" Breast cancer is a significant global health issue, and the diagnosis of\nbreast imaging has always been challenging. Mammography images typically have\nextremely high resolution, with lesions occupying only a very small area.\nDown-sampling in neural networks can easily lead to the loss of\nmicrocalcifications or subtle structures, making it difficult for traditional\nneural network architectures to address these issues. To tackle these\nchallenges, we propose a Context Clustering Network with triple information\nfusion. Firstly, compared to CNNs or transformers, we find that Context\nclustering methods (1) are more computationally efficient and (2) can more\neasily associate structural or pathological features, making them suitable for\nthe clinical tasks of mammography. Secondly, we propose a triple information\nfusion mechanism that integrates global information, feature-based local\ninformation, and patch-based local information. The proposed approach is\nrigorously evaluated on two public datasets, Vindr-Mammo and CBIS-DDSM, using\nfive independent splits to ensure statistical robustness. Our method achieves\nan AUC of 0.828 on Vindr-Mammo and 0.805 on CBIS-DDSM, outperforming the next\nbest method by 3.1% and 2.4%, respectively. 
These improvements are\nstatistically significant (p<0.05), underscoring the benefits of Context\nClustering Network with triple information fusion. Overall, our Context\nClustering framework demonstrates strong potential as a scalable and\ncost-effective solution for large-scale mammography screening, enabling more\nefficient and accurate breast cancer detection. Access to our method is\navailable at https://github.com/Sohyu1/Mammo_Clustering.\n","authors":["Shilong Yang","Chulong Zhang","Qi Zang","Juan Yu","Liang Zeng","Xiao Luo","Yexuan Xing","Xin Pan","Qi Li","Xiaokun Liang","Yaoqin Xie"],"pdf_url":"https://arxiv.org/pdf/2409.14876v3.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.09036v2","updated":"2025-03-02T17:15:30Z","published":"2023-08-17T15:17:49Z","title":"Synthesizing Physically Plausible Human Motions in 3D Scenes","summary":" We present a physics-based character control framework for synthesizing\nhuman-scene interactions. Recent advances adopt physics simulation to mitigate\nartifacts produced by data-driven kinematic approaches. However, existing\nphysics-based methods mainly focus on single-object environments, resulting in\nlimited applicability in realistic 3D scenes with multi-objects. To address\nsuch challenges, we propose a framework that enables physically simulated\ncharacters to perform long-term interaction tasks in diverse, cluttered, and\nunseen 3D scenes. The key idea is to decouple human-scene interactions into two\nfundamental processes, Interacting and Navigating, which motivates us to\nconstruct two reusable Controllers, namely InterCon and NavCon. Specifically,\nInterCon uses two complementary policies to enable characters to enter or leave\nthe interacting state with a particular object (e.g., sitting on a chair or\ngetting up). To realize navigation in cluttered environments, we introduce\nNavCon, where a trajectory following policy enables characters to track\npre-planned collision-free paths. 
Benefiting from the divide and conquer\nstrategy, we can train all policies in simple environments and directly apply\nthem in complex multi-object scenes through coordination from a rule-based\nscheduler. Video and code are available at\nhttps://github.com/liangpan99/InterScene.\n","authors":["Liang Pan","Jingbo Wang","Buzhen Huang","Junyu Zhang","Haofan Wang","Xu Tang","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09036v2.pdf","comment":"3DV 2024 version"},{"id":"http://arxiv.org/abs/2502.11882v3","updated":"2025-03-02T17:15:11Z","published":"2025-02-17T15:09:45Z","title":"Leveraging Dual Process Theory in Language Agent Framework for Real-time\n Simultaneous Human-AI Collaboration","summary":" Agents built on large language models (LLMs) have excelled in turn-by-turn\nhuman-AI collaboration but struggle with simultaneous tasks requiring real-time\ninteraction. Latency issues and the challenge of inferring variable human\nstrategies hinder their ability to make autonomous decisions without explicit\ninstructions. Through experiments with current independent System 1 and System\n2 methods, we validate the necessity of using Dual Process Theory (DPT) in\nreal-time tasks. We propose DPT-Agent, a novel language agent framework that\nintegrates System 1 and System 2 for efficient real-time simultaneous human-AI\ncollaboration. DPT-Agent's System 1 uses a Finite-state Machine (FSM) and\ncode-as-policy for fast, intuitive, and controllable decision-making.\nDPT-Agent's System 2 integrates Theory of Mind (ToM) and asynchronous\nreflection to infer human intentions and perform reasoning-based autonomous\ndecisions. We demonstrate the effectiveness of DPT-Agent through further\nexperiments with rule-based agents and human collaborators, showing significant\nimprovements over mainstream LLM-based frameworks. DPT-Agent can effectively\nhelp LLMs convert correct slow thinking and reasoning into executable actions,\nthereby improving performance. 
To the best of our knowledge, DPT-Agent is the\nfirst language agent framework that achieves successful real-time simultaneous\nhuman-AI collaboration autonomously. Code of DPT-Agent can be found in\nhttps://github.com/sjtu-marl/DPT-Agent.\n","authors":["Shao Zhang","Xihuai Wang","Wenhao Zhang","Chaoran Li","Junru Song","Tingyu Li","Lin Qiu","Xuezhi Cao","Xunliang Cai","Wen Yao","Weinan Zhang","Xinbing Wang","Ying Wen"],"pdf_url":"https://arxiv.org/pdf/2502.11882v3.pdf","comment":"Preprint under review. Update the experimental results of the\n DeepSeek-R1 series models, o3-mini-high and o3-mini-medium"},{"id":"http://arxiv.org/abs/2410.15744v2","updated":"2025-03-02T16:58:17Z","published":"2024-10-21T08:01:58Z","title":"Unleashing the Potential of Vision-Language Pre-Training for 3D\n Zero-Shot Lesion Segmentation via Mask-Attribute Alignment","summary":" Recent advancements in medical vision-language pre-training models have\ndriven significant progress in zero-shot disease recognition. However,\ntransferring image-level knowledge to pixel-level tasks, such as lesion\nsegmentation in 3D CT scans, remains a critical challenge. Due to the\ncomplexity and variability of pathological visual characteristics, existing\nmethods struggle to align fine-grained lesion features not encountered during\ntraining with disease-related textual representations. In this paper, we\npresent Malenia, a novel multi-scale lesion-level mask-attribute alignment\nframework, specifically designed for 3D zero-shot lesion segmentation. Malenia\nimproves the compatibility between mask representations and their associated\nelemental attributes, explicitly linking the visual features of unseen lesions\nwith the extensible knowledge learned from previously seen ones. Furthermore,\nwe design a Cross-Modal Knowledge Injection module to enhance both visual and\ntextual features with mutually beneficial information, effectively guiding the\ngeneration of segmentation results. 
Comprehensive experiments across three\ndatasets and 12 lesion categories validate the superior performance of Malenia.\n","authors":["Yankai Jiang","Wenhui Lei","Xiaofan Zhang","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.15744v2.pdf","comment":"Accepted as ICLR 2025 conference paper"},{"id":"http://arxiv.org/abs/2501.07468v3","updated":"2025-03-02T16:57:11Z","published":"2025-01-13T16:35:52Z","title":"From Screens to Scenes: A Survey of Embodied AI in Healthcare","summary":" Healthcare systems worldwide face persistent challenges in efficiency,\naccessibility, and personalization. Powered by modern AI technologies such as\nmultimodal large language models and world models, Embodied AI (EmAI)\nrepresents a transformative frontier, offering enhanced autonomy and the\nability to interact with the physical world to address these challenges. As an\ninterdisciplinary and rapidly evolving research domain, \"EmAI in healthcare\"\nspans diverse fields such as algorithms, robotics, and biomedicine. This\ncomplexity underscores the importance of timely reviews and analyses to track\nadvancements, address challenges, and foster cross-disciplinary collaboration.\nIn this paper, we provide a comprehensive overview of the \"brain\" of EmAI for\nhealthcare, wherein we introduce foundational AI algorithms for perception,\nactuation, planning, and memory, and focus on presenting the healthcare\napplications spanning clinical interventions, daily care & companionship,\ninfrastructure support, and biomedical research. Despite its promise, the\ndevelopment of EmAI for healthcare is hindered by critical challenges such as\nsafety concerns, gaps between simulation platforms and real-world applications,\nthe absence of standardized benchmarks, and uneven progress across\ninterdisciplinary domains. We discuss the technical barriers and explore\nethical considerations, offering a forward-looking perspective on the future of\nEmAI in healthcare. 
A hierarchical framework of intelligent levels for EmAI\nsystems is also introduced to guide further development. By providing\nsystematic insights, this work aims to inspire innovation and practical\napplications, paving the way for a new era of intelligent, patient-centered\nhealthcare.\n","authors":["Yihao Liu","Xu Cao","Tingting Chen","Yankai Jiang","Junjie You","Minghua Wu","Xiaosong Wang","Mengling Feng","Yaochu Jin","Jintai Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07468v3.pdf","comment":"56 pages, 11 figures, manuscript accepted by Information Fusion"},{"id":"http://arxiv.org/abs/2410.08899v2","updated":"2025-03-02T16:12:10Z","published":"2024-10-11T15:18:48Z","title":"Utilizing ChatGPT in a Data Structures and Algorithms Course: A Teaching\n Assistant's Perspective","summary":" Integrating large language models (LLMs) like ChatGPT into computer science\neducation offers transformative potential for complex courses such as data\nstructures and algorithms (DSA). This study examines ChatGPT as a supplementary\ntool for teaching assistants (TAs), guided by structured prompts and human\noversight, to enhance instruction and student outcomes. A controlled experiment\ncompared traditional TA-led instruction with a hybrid approach where TAs used\nChatGPT-4o and ChatGPT o1 to generate exercises, clarify concepts, and provide\nfeedback. Structured prompts emphasized problem decomposition, real-world\ncontext, and code examples, enabling tailored support while mitigating\nover-reliance on AI. Results demonstrated the hybrid approach's efficacy, with\nstudents in the ChatGPT-assisted group scoring 16.50 points higher on average\nand excelling in advanced topics. However, ChatGPT's limitations necessitated\nTA verification. 
This framework highlights the dual role of LLMs: augmenting TA\nefficiency while ensuring accuracy through human oversight, offering a scalable\nsolution for human-AI collaboration in education.\n","authors":["Pooriya Jamie","Reyhaneh Hajihashemi","Sharareh Alipour"],"pdf_url":"https://arxiv.org/pdf/2410.08899v2.pdf","comment":"Accepted at CHI EA '25 (Extended Abstracts of the CHI Conference on\n Human Factors in Computing Systems, 2025). The final version is available at\n the External DOI"},{"id":"http://arxiv.org/abs/2412.20138v5","updated":"2025-03-02T15:57:39Z","published":"2024-12-28T12:54:06Z","title":"TradingAgents: Multi-Agents LLM Financial Trading Framework","summary":" Significant progress has been made in automated problem-solving using\nsocieties of agents powered by large language models (LLMs). In finance,\nefforts have largely focused on single-agent systems handling specific tasks or\nmulti-agent frameworks independently gathering data. However, multi-agent\nsystems' potential to replicate real-world trading firms' collaborative\ndynamics remains underexplored. TradingAgents proposes a novel stock trading\nframework inspired by trading firms, featuring LLM-powered agents in\nspecialized roles such as fundamental analysts, sentiment analysts, technical\nanalysts, and traders with varied risk profiles. The framework includes Bull\nand Bear researcher agents assessing market conditions, a risk management team\nmonitoring exposure, and traders synthesizing insights from debates and\nhistorical data to make informed decisions. By simulating a dynamic,\ncollaborative trading environment, this framework aims to improve trading\nperformance. Detailed architecture and extensive experiments reveal its\nsuperiority over baseline models, with notable improvements in cumulative\nreturns, Sharpe ratio, and maximum drawdown, highlighting the potential of\nmulti-agent LLM frameworks in financial trading. 
TradingAgents is available at\nhttps://github.com/PioneerFintech.\n","authors":["Yijia Xiao","Edward Sun","Di Luo","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.20138v5.pdf","comment":"Multi-Agent AI in the Real World @ AAAI 2025"},{"id":"http://arxiv.org/abs/2501.03895v2","updated":"2025-03-02T15:55:07Z","published":"2025-01-07T16:03:14Z","title":"LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One\n Vision Token","summary":" The advent of real-time large multimodal models (LMMs) like GPT-4o has\nsparked considerable interest in efficient LMMs. LMM frameworks typically\nencode visual inputs into vision tokens (continuous representations) and\nintegrate them and textual instructions into the context of large language\nmodels (LLMs), where large-scale parameters and numerous context tokens\n(predominantly vision tokens) result in substantial computational overhead.\nPrevious efforts towards efficient LMMs always focus on replacing the LLM\nbackbone with smaller models, while neglecting the crucial issue of token\nquantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal\nvision tokens. To achieve a high compression ratio of vision tokens while\npreserving visual information, we first analyze how LMMs understand vision\ntokens and find that most vision tokens only play a crucial role in the early\nlayers of LLM backbone, where they mainly fuse visual information into text\ntokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to\nfuse visual information into text tokens in advance, thereby facilitating the\nextreme compression of vision tokens fed to LLM backbone into one token.\nLLaVA-Mini is a unified large multimodal model that can support the\nunderstanding of images, high-resolution images, and videos in an efficient\nmanner. Experiments across 11 image-based and 7 video-based benchmarks\ndemonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token\ninstead of 576. 
Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by\n77%, deliver low-latency responses within 40 milliseconds, and process over\n10,000 frames of video on the GPU hardware with 24GB of memory.\n","authors":["Shaolei Zhang","Qingkai Fang","Zhe Yang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2501.03895v2.pdf","comment":"Accepted to ICLR 2025. Code: https://github.com/ictnlp/LLaVA-Mini\n Model: https://huggingface.co/ICTNLP/llava-mini-llama-3.1-8b"},{"id":"http://arxiv.org/abs/2412.12164v2","updated":"2025-03-02T15:12:38Z","published":"2024-12-11T19:12:22Z","title":"GAMED: Knowledge Adaptive Multi-Experts Decoupling for Multimodal Fake\n News Detection","summary":" Multimodal fake news detection often involves modelling heterogeneous data\nsources, such as vision and language. Existing detection methods typically rely\non fusion effectiveness and cross-modal consistency to model the content,\ncomplicating understanding how each modality affects prediction accuracy.\nAdditionally, these methods are primarily based on static feature modelling,\nmaking it difficult to adapt to the dynamic changes and relationships between\ndifferent data modalities. This paper develops a significantly novel approach,\nGAMED, for multimodal modelling, which focuses on generating distinctive and\ndiscriminative features through modal decoupling to enhance cross-modal\nsynergies, thereby optimizing overall performance in the detection process.\nGAMED leverages multiple parallel expert networks to refine features and\npre-embed semantic knowledge to improve the experts' ability in information\nselection and viewpoint sharing. Subsequently, the feature distribution of each\nmodality is adaptively adjusted based on the respective experts' opinions.\nGAMED also introduces a novel classification technique to dynamically manage\ncontributions from different modalities, while improving the explainability of\ndecisions. 
Experimental results on the Fakeddit and Yang datasets demonstrate\nthat GAMED performs better than recently developed state-of-the-art models. The\nsource code can be accessed at https://github.com/slz0925/GAMED.\n","authors":["Lingzhi Shen","Yunfei Long","Xiaohao Cai","Imran Razzak","Guanming Chen","Kang Liu","Shoaib Jameel"],"pdf_url":"https://arxiv.org/pdf/2412.12164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04139v3","updated":"2025-03-02T14:52:21Z","published":"2024-12-05T13:06:03Z","title":"Monet: Mixture of Monosemantic Experts for Transformers","summary":" Understanding the internal computations of large language models (LLMs) is\ncrucial for aligning them with human values and preventing undesirable\nbehaviors like toxic content generation. However, mechanistic interpretability\nis hindered by polysemanticity -- where individual neurons respond to multiple,\nunrelated concepts. While Sparse Autoencoders (SAEs) have attempted to\ndisentangle these features through sparse dictionary learning, they have\ncompromised LLM performance due to reliance on post-hoc reconstruction loss. To\naddress this issue, we introduce Mixture of Monosemantic Experts for\nTransformers (Monet) architecture, which incorporates sparse dictionary\nlearning directly into end-to-end Mixture-of-Experts pretraining. Our novel\nexpert decomposition method enables scaling the expert count to 262,144 per\nlayer while total parameters scale proportionally to the square root of the\nnumber of experts. Our analyses demonstrate mutual exclusivity of knowledge\nacross experts and showcase the parametric knowledge encapsulated within\nindividual experts. Moreover, Monet allows knowledge manipulation over domains,\nlanguages, and toxicity mitigation without degrading general performance. 
Our\npursuit of transparent LLMs highlights the potential of scaling expert counts\nto enhance mechanistic interpretability and directly resect the internal\nknowledge to fundamentally adjust model behavior. The source code and\npretrained checkpoints are available at https://github.com/dmis-lab/Monet.\n","authors":["Jungwoo Park","Young Jin Ahn","Kee-Eung Kim","Jaewoo Kang"],"pdf_url":"https://arxiv.org/pdf/2412.04139v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10781v2","updated":"2025-03-02T14:37:53Z","published":"2024-10-14T17:50:28Z","title":"When Attention Sink Emerges in Language Models: An Empirical View","summary":" Language Models (LMs) assign significant attention to the first token, even\nif it is not semantically important, which is known as attention sink. This\nphenomenon has been widely adopted in applications such as streaming/long\ncontext generation, KV cache optimization, inference acceleration, model\nquantization, and others. Despite its widespread use, a deep understanding of\nattention sink in LMs is still lacking. In this work, we first demonstrate that\nattention sinks exist universally in LMs with various inputs, even in small\nmodels. Furthermore, attention sink is observed to emerge during the LM\npre-training, motivating us to investigate how optimization, data distribution,\nloss function, and model architecture in LM pre-training influence its\nemergence. We highlight that attention sink emerges after effective\noptimization on sufficient training data. The sink position is highly\ncorrelated with the loss function and data distribution. Most importantly, we\nfind that attention sink acts more like key biases, storing extra attention\nscores, which could be non-informative and not contribute to the value\ncomputation. We also observe that this phenomenon (at least partially) stems\nfrom tokens' inner dependence on attention scores as a result of softmax\nnormalization. 
After relaxing such dependence by replacing softmax attention\nwith other attention operations, such as sigmoid attention without\nnormalization, attention sinks do not emerge in LMs up to 1B parameters. The\ncode is available at https://github.com/sail-sg/Attention-Sink.\n","authors":["Xiangming Gu","Tianyu Pang","Chao Du","Qian Liu","Fengzhuo Zhang","Cunxiao Du","Ye Wang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2410.10781v2.pdf","comment":"ICLR 2025 (Spotlight)"},{"id":"http://arxiv.org/abs/2409.13555v2","updated":"2025-03-02T14:36:29Z","published":"2024-09-20T14:56:33Z","title":"Generating Visual Stories with Grounded and Coreferent Characters","summary":" Characters are important in narratives. They move the plot forward, create\nemotional connections, and embody the story's themes. Visual storytelling\nmethods focus more on the plot and events relating to it, without building the\nnarrative around specific characters. As a result, the generated stories feel\ngeneric, with character mentions being absent, vague, or incorrect. To mitigate\nthese issues, we introduce the new task of character-centric story generation\nand present the first model capable of predicting visual stories with\nconsistently grounded and coreferent character mentions. Our model is finetuned\non a new dataset which we build on top of the widely used VIST benchmark.\nSpecifically, we develop an automated pipeline to enrich VIST with visual and\ntextual character coreference chains. We also propose new evaluation metrics to\nmeasure the richness of characters and coreference in stories. 
Experimental\nresults show that our model generates stories with recurring characters which\nare consistent and coreferent to larger extent compared to baselines and\nstate-of-the-art systems.\n","authors":["Danyang Liu","Mirella Lapata","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2409.13555v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07137v2","updated":"2025-03-02T14:28:33Z","published":"2024-10-09T17:53:06Z","title":"Cheating Automatic LLM Benchmarks: Null Models Achieve High Win Rates","summary":" Automatic LLM benchmarks, such as AlpacaEval 2.0, Arena-Hard-Auto, and\nMT-Bench, have become popular for evaluating language models due to their\ncost-effectiveness and scalability compared to human evaluation. Achieving high\nwin rates on these benchmarks can significantly boost the promotional impact of\nnewly released language models. This promotional benefit may motivate tricks,\nsuch as manipulating model output length or style to game win rates, even\nthough several mechanisms have been developed to control length and disentangle\nstyle to reduce gameability. Nonetheless, we show that even a \"null model\" that\nalways outputs a constant response (irrelevant to input instructions) can cheat\nautomatic benchmarks and achieve top-ranked win rates: an 86.5% LC win rate on\nAlpacaEval 2.0; an 83.0 score on Arena-Hard-Auto; and a 9.55 score on MT-Bench.\nMoreover, the crafted cheating outputs are transferable because we assume that\nthe instructions of these benchmarks (e.g., 805 samples of AlpacaEval 2.0) are\nprivate and cannot be accessed. While our experiments are primarily\nproof-of-concept, an adversary could use LLMs to generate more imperceptible\ncheating responses, unethically benefiting from high win rates and promotional\nimpact. Our findings call for the development of anti-cheating mechanisms for\nreliable automatic benchmarks. 
The code is available at\nhttps://github.com/sail-sg/Cheating-LLM-Benchmarks.\n","authors":["Xiaosen Zheng","Tianyu Pang","Chao Du","Qian Liu","Jing Jiang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2410.07137v2.pdf","comment":"ICLR 2025 (Oral)"},{"id":"http://arxiv.org/abs/2410.16699v2","updated":"2025-03-02T14:18:13Z","published":"2024-10-22T05:11:45Z","title":"Graph Transformers Dream of Electric Flow","summary":" We show theoretically and empirically that the linear Transformer, when\napplied to graph data, can implement algorithms that solve canonical problems\nsuch as electric flow and eigenvector decomposition. The Transformer has access\nto information on the input graph only via the graph's incidence matrix. We\npresent explicit weight configurations for implementing each algorithm, and we\nbound the constructed Transformers' errors by the errors of the underlying\nalgorithms. Our theoretical findings are corroborated by experiments on\nsynthetic data. Additionally, on a real-world molecular regression task, we\nobserve that the linear Transformer is capable of learning a more effective\npositional encoding than the default one based on Laplacian eigenvectors. Our\nwork is an initial step towards elucidating the inner-workings of the\nTransformer for graph data. Code is available at\nhttps://github.com/chengxiang/LinearGraphTransformer\n","authors":["Xiang Cheng","Lawrence Carin","Suvrit Sra"],"pdf_url":"https://arxiv.org/pdf/2410.16699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02094v3","updated":"2025-03-02T14:04:22Z","published":"2024-10-02T23:30:05Z","title":"Tracking objects that change in appearance with phase synchrony","summary":" Objects we encounter often change appearance as we interact with them.\nChanges in illumination (shadows), object pose, or the movement of non-rigid\nobjects can drastically alter available image features. How do biological\nvisual systems track objects as they change? 
One plausible mechanism involves\nattentional mechanisms for reasoning about the locations of objects\nindependently of their appearances -- a capability that prominent neuroscience\ntheories have associated with computing through neural synchrony. Here, we\ndescribe a novel deep learning circuit that can learn to precisely control\nattention to features separately from their location in the world through\nneural synchrony: the complex-valued recurrent neural network (CV-RNN). Next,\nwe compare object tracking in humans, the CV-RNN, and other deep neural\nnetworks (DNNs), using FeatureTracker: a large-scale challenge that asks\nobservers to track objects as their locations and appearances change in\nprecisely controlled ways. While humans effortlessly solved FeatureTracker,\nstate-of-the-art DNNs did not. In contrast, our CV-RNN behaved similarly to\nhumans on the challenge, providing a computational proof-of-concept for the\nrole of phase synchronization as a neural substrate for tracking\nappearance-morphing objects as they move about.\n","authors":["Sabine Muzellec","Drew Linsley","Alekh K. Ashok","Ennio Mingolla","Girik Malik","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2410.02094v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07575v4","updated":"2025-03-02T13:55:52Z","published":"2024-04-11T09:06:49Z","title":"An Effective Automated Speaking Assessment Approach to Mitigating Data\n Scarcity and Imbalanced Distribution","summary":" Automated speaking assessment (ASA) typically involves automatic speech\nrecognition (ASR) and hand-crafted feature extraction from the ASR transcript\nof a learner's speech. Recently, self-supervised learning (SSL) has shown\nstellar performance compared to traditional methods. 
However, SSL-based ASA\nsystems are faced with at least three data-related challenges: limited\nannotated data, uneven distribution of learner proficiency levels and\nnon-uniform score intervals between different CEFR proficiency levels. To\naddress these challenges, we explore the use of two novel modeling strategies:\nmetric-based classification and loss reweighting, leveraging distinct SSL-based\nembedding features. Extensive experimental results on the ICNALE benchmark\ndataset suggest that our approach can outperform existing strong baselines by a\nsizable margin, achieving a significant improvement of more than 10% in CEFR\nprediction accuracy.\n","authors":["Tien-Hong Lo","Fu-An Chao","Tzu-I Wu","Yao-Ting Sung","Berlin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07575v4.pdf","comment":"Accepted to NAACL 2024 Findings"},{"id":"http://arxiv.org/abs/2502.20209v2","updated":"2025-03-02T13:36:57Z","published":"2025-02-27T15:50:21Z","title":"DIPSER: A Dataset for In-Person Student Engagement Recognition in the\n Wild","summary":" In this paper, a novel dataset is introduced, designed to assess student\nattention within in-person classroom settings. This dataset encompasses RGB\ncamera data, featuring multiple cameras per student to capture both posture and\nfacial expressions, in addition to smartwatch sensor data for each individual.\nThis dataset allows machine learning algorithms to be trained to predict\nattention and correlate it with emotion. A comprehensive suite of attention and\nemotion labels for each student is provided, generated through self-reporting\nas well as evaluations by four different experts. 
Our dataset uniquely combines\nfacial and environmental camera data, smartwatch metrics, and includes\nunderrepresented ethnicities in similar datasets, all within in-the-wild,\nin-person settings, making it the most comprehensive dataset of its kind\ncurrently available.\n The dataset presented offers an extensive and diverse collection of data\npertaining to student interactions across different educational contexts,\naugmented with additional metadata from other tools. This initiative addresses\nexisting deficiencies by offering a valuable resource for the analysis of\nstudent attention and emotion in face-to-face lessons.\n","authors":["Luis Marquez-Carpintero","Sergio Suescun-Ferrandiz","Carolina Lorenzo Álvarez","Jorge Fernandez-Herrero","Diego Viejo","Rosabel Roig-Vila","Miguel Cazorla"],"pdf_url":"https://arxiv.org/pdf/2502.20209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.02820v2","updated":"2025-03-02T13:14:11Z","published":"2022-08-04T02:22:29Z","title":"MOVE: Effective and Harmless Ownership Verification via Embedded\n External Features","summary":" Currently, deep neural networks (DNNs) are widely adopted in different\napplications. Despite its commercial values, training a well-performing DNN is\nresource-consuming. Accordingly, the well-trained model is valuable\nintellectual property for its owner. However, recent studies revealed the\nthreats of model stealing, where the adversaries can obtain a function-similar\ncopy of the victim model, even when they can only query the model. In this\npaper, we propose an effective and harmless model ownership verification (MOVE)\nto defend against different types of model stealing simultaneously, without\nintroducing new security risks. In general, we conduct the ownership\nverification by verifying whether a suspicious model contains the knowledge of\ndefender-specified external features. Specifically, we embed the external\nfeatures by modifying a few training samples with style transfer. 
We then train\na meta-classifier to determine whether a model is stolen from the victim. This\napproach is inspired by the understanding that the stolen models should contain\nthe knowledge of features learned by the victim model. In particular,\n\\revision{we develop our MOVE method under both white-box and black-box\nsettings and analyze its theoretical foundation to provide comprehensive model\nprotection.} Extensive experiments on benchmark datasets verify the\neffectiveness of our method and its resistance to potential adaptive attacks.\nThe codes for reproducing the main experiments of our method are available at\nhttps://github.com/THUYimingLi/MOVE.\n","authors":["Yiming Li","Linghui Zhu","Xiaojun Jia","Yang Bai","Yong Jiang","Shu-Tao Xia","Xiaochun Cao","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2208.02820v2.pdf","comment":"This paper has been accepted by IEEE TPAMI 2025. It is the journal\n extension of our conference paper in AAAI 2022\n (https://ojs.aaai.org/index.php/AAAI/article/view/20036). 18 pages"},{"id":"http://arxiv.org/abs/2312.08671v2","updated":"2025-03-02T13:13:42Z","published":"2023-12-14T06:08:35Z","title":"Permutation-Invariant Graph Partitioning:How Graph Neural Networks\n Capture Structural Interactions?","summary":" Graph Neural Networks (GNNs) have paved the way for being a cornerstone in\ngraph-related learning tasks. Yet, the ability of GNNs to capture structural\ninteractions within graphs remains under-explored. In this work, we address\nthis gap by drawing on the insight that permutation invariant graph\npartitioning enables a powerful way of exploring structural interactions. We\nestablish theoretical connections between permutation invariant graph\npartitioning and graph isomorphism, and then propose Graph Partitioning Neural\nNetworks (GPNNs), a novel architecture that efficiently enhances the expressive\npower of GNNs in learning structural interactions. 
We analyze how partitioning\nschemes and structural interactions contribute to GNN expressivity and their\ntrade-offs with complexity. Empirically, we demonstrate that GPNNs outperform\nexisting GNN models in capturing structural interactions across diverse graph\nbenchmark tasks.\n","authors":["Asela Hevapathige","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2312.08671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01229v2","updated":"2025-03-02T12:27:07Z","published":"2024-05-02T12:18:14Z","title":"Boosting Jailbreak Attack with Momentum","summary":" Large Language Models (LLMs) have achieved remarkable success across diverse\ntasks, yet they remain vulnerable to adversarial attacks, notably the\nwell-known jailbreak attack. In particular, the Greedy Coordinate Gradient\n(GCG) attack has demonstrated efficacy in exploiting this vulnerability by\noptimizing adversarial prompts through a combination of gradient heuristics and\ngreedy search. However, the efficiency of this attack has become a bottleneck\nin the attacking process. To mitigate this limitation, in this paper we rethink\nthe generation of the adversarial prompts through an optimization lens, aiming\nto stabilize the optimization process and harness more heuristic insights from\nprevious optimization iterations. Specifically, we propose the\n\\textbf{M}omentum \\textbf{A}ccelerated G\\textbf{C}G (\\textbf{MAC}) attack,\nwhich integrates a momentum term into the gradient heuristic to boost and\nstabilize the random search for tokens in adversarial prompts. Experimental\nresults showcase the notable enhancement achieved by MAC over baselines in\nterms of attack success rate and optimization efficiency. Moreover, we\ndemonstrate that MAC can still exhibit superior performance for transfer\nattacks and models under defense mechanisms. 
Our code is available at\nhttps://github.com/weizeming/momentum-attack-llm.\n","authors":["Yihao Zhang","Zeming Wei"],"pdf_url":"https://arxiv.org/pdf/2405.01229v2.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2412.05994v2","updated":"2025-03-02T12:21:49Z","published":"2024-12-08T16:58:29Z","title":"PIG: Physics-Informed Gaussians as Adaptive Parametric Mesh\n Representations","summary":" The numerical approximation of partial differential equations (PDEs) using\nneural networks has seen significant advancements through Physics-Informed\nNeural Networks (PINNs). Despite their straightforward optimization framework\nand flexibility in implementing various PDEs, PINNs often suffer from limited\naccuracy due to the spectral bias of Multi-Layer Perceptrons (MLPs), which\nstruggle to effectively learn high-frequency and nonlinear components.\nRecently, parametric mesh representations in combination with neural networks\nhave been investigated as a promising approach to eliminate the inductive bias\nof MLPs. However, they usually require high-resolution grids and a large number\nof collocation points to achieve high accuracy while avoiding overfitting. In\naddition, the fixed positions of the mesh parameters restrict their\nflexibility, making accurate approximation of complex PDEs challenging. To\novercome these limitations, we propose Physics-Informed Gaussians (PIGs), which\ncombine feature embeddings using Gaussian functions with a lightweight neural\nnetwork. Our approach uses trainable parameters for the mean and variance of\neach Gaussian, allowing for dynamic adjustment of their positions and shapes\nduring training. This adaptability enables our model to optimally approximate\nPDE solutions, unlike models with fixed parameter positions. Furthermore, the\nproposed approach maintains the same optimization framework used in PINNs,\nallowing us to benefit from their excellent properties. 
Experimental results\nshow the competitive performance of our model across various PDEs,\ndemonstrating its potential as a robust tool for solving complex PDEs. Our\nproject page is available at\nhttps://namgyukang.github.io/Physics-Informed-Gaussians/\n","authors":["Namgyu Kang","Jaemin Oh","Youngjoon Hong","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2412.05994v2.pdf","comment":"Project page:\n https://namgyukang.github.io/Physics-Informed-Gaussians/"},{"id":"http://arxiv.org/abs/2411.02275v2","updated":"2025-03-02T11:48:40Z","published":"2024-11-04T17:05:37Z","title":"Breaking the Reclustering Barrier in Centroid-based Deep Clustering","summary":" This work investigates an important phenomenon in centroid-based deep\nclustering (DC) algorithms: Performance quickly saturates after a period of\nrapid early gains. Practitioners commonly address early saturation with\nperiodic reclustering, which we demonstrate to be insufficient to address\nperformance plateaus. We call this phenomenon the \"reclustering barrier\" and\nempirically show when the reclustering barrier occurs, what its underlying\nmechanisms are, and how it is possible to Break the Reclustering Barrier with\nour algorithm BRB. BRB avoids early over-commitment to initial clusterings and\nenables continuous adaptation to reinitialized clustering targets while\nremaining conceptually simple. Applying our algorithm to widely-used\ncentroid-based DC algorithms, we show that (1) BRB consistently improves\nperformance across a wide range of clustering benchmarks, (2) BRB enables\ntraining from scratch, and (3) BRB performs competitively against\nstate-of-the-art DC algorithms when combined with a contrastive loss. 
We\nrelease our code and pre-trained models at\nhttps://github.com/Probabilistic-and-Interactive-ML/breaking-the-reclustering-barrier .\n","authors":["Lukas Miklautz","Timo Klein","Kevin Sidak","Collin Leiber","Thomas Lang","Andrii Shkabrii","Sebastian Tschiatschek","Claudia Plant"],"pdf_url":"https://arxiv.org/pdf/2411.02275v2.pdf","comment":"Accepted at ICLR 2025 (Camera-ready version)"},{"id":"http://arxiv.org/abs/2407.05649v4","updated":"2025-03-02T11:37:49Z","published":"2024-07-08T06:21:56Z","title":"Greener GRASS: Enhancing GNNs with Encoding, Rewiring, and Attention","summary":" Graph Neural Networks (GNNs) have become important tools for machine learning\non graph-structured data. In this paper, we explore the synergistic combination\nof graph encoding, graph rewiring, and graph attention, by introducing Graph\nAttention with Stochastic Structures (GRASS), a novel GNN architecture. GRASS\nutilizes relative random walk probabilities (RRWP) encoding and a novel\ndecomposed variant (D-RRWP) to efficiently capture structural information. It\nrewires the input graph by superimposing a random regular graph to enhance\nlong-range information propagation. It also employs a novel additive attention\nmechanism tailored for graph-structured data. Our empirical evaluations\ndemonstrate that GRASS achieves state-of-the-art performance on multiple\nbenchmark datasets, including a 20.3% reduction in mean absolute error on the\nZINC dataset.\n","authors":["Tongzhou Liao","Barnabás Póczos"],"pdf_url":"https://arxiv.org/pdf/2407.05649v4.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2410.02242v2","updated":"2025-03-02T11:32:27Z","published":"2024-10-03T06:30:27Z","title":"Robust Weight Initialization for Tanh Neural Networks with Fixed Point\n Analysis","summary":" As a neural network's depth increases, it can improve generalization\nperformance. 
However, training deep networks is challenging due to gradient and\nsignal propagation issues. To address these challenges, extensive theoretical\nresearch and various methods have been introduced. Despite these advances,\neffective weight initialization methods for tanh neural networks remain\ninsufficiently investigated. This paper presents a novel weight initialization\nmethod for neural networks with tanh activation function. Based on an analysis\nof the fixed points of the function $\\tanh(ax)$, the proposed method aims to\ndetermine values of $a$ that mitigate activation saturation. A series of\nexperiments on various classification datasets and physics-informed neural\nnetworks demonstrates that the proposed method outperforms Xavier\ninitialization methods~(with or without normalization) in terms of robustness\nacross different network sizes, data efficiency, and convergence speed. Code is\navailable at https://github.com/1HyunwooLee/Tanh-Init\n","authors":["Hyunwoo Lee","Hayoung Choi","Hyunju Kim"],"pdf_url":"https://arxiv.org/pdf/2410.02242v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2501.15445v2","updated":"2025-03-02T11:16:08Z","published":"2025-01-26T08:22:44Z","title":"StochSync: Stochastic Diffusion Synchronization for Image Generation in\n Arbitrary Spaces","summary":" We propose a zero-shot method for generating images in arbitrary spaces\n(e.g., a sphere for 360{\\deg} panoramas and a mesh surface for texture) using a\npretrained image diffusion model. The zero-shot generation of various visual\ncontent using a pretrained image diffusion model has been explored mainly in\ntwo directions. First, Diffusion Synchronization-performing reverse diffusion\nprocesses jointly across different projected spaces while synchronizing them in\nthe target space-generates high-quality outputs when enough conditioning is\nprovided, but it struggles in its absence. 
Second, Score Distillation\nSampling-gradually updating the target space data through gradient\ndescent-results in better coherence but often lacks detail. In this paper, we\nreveal for the first time the interconnection between these two methods while\nhighlighting their differences. To this end, we propose StochSync, a novel\napproach that combines the strengths of both, enabling effective performance\nwith weak conditioning. Our experiments demonstrate that StochSync provides the\nbest performance in 360{\\deg} panorama generation (where image conditioning is\nnot given), outperforming previous finetuning-based methods, and also delivers\ncomparable results in 3D mesh texturing (where depth conditioning is provided)\nwith previous methods.\n","authors":["Kyeongmin Yeo","Jaihoon Kim","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2501.15445v2.pdf","comment":"Project page: https://stochsync.github.io/ (ICLR 2025)"},{"id":"http://arxiv.org/abs/2402.05569v5","updated":"2025-03-02T10:48:32Z","published":"2024-02-08T11:10:39Z","title":"Training-Free Message Passing for Learning on Hypergraphs","summary":" Hypergraphs are crucial for modelling higher-order interactions in real-world\ndata. Hypergraph neural networks (HNNs) effectively utilise these structures by\nmessage passing to generate informative node features for various downstream\ntasks like node classification. However, the message passing module in existing\nHNNs typically requires a computationally intensive training process, which\nlimits their practical use. To tackle this challenge, we propose an alternative\napproach by decoupling the usage of hypergraph structural information from the\nmodel learning stage. This leads to a novel training-free message passing\nmodule, named TF-MP-Module, which can be precomputed in the data preprocessing\nstage, thereby reducing the computational burden. We refer to the hypergraph\nneural network equipped with our TF-MP-Module as TF-HNN. 
We theoretically\nsupport the efficiency and effectiveness of TF-HNN by showing that: 1) It is\nmore training-efficient compared to existing HNNs; 2) It utilises as much\ninformation as existing HNNs for node feature generation; and 3) It is robust\nagainst the oversmoothing issue while using long-range interactions.\nExperiments based on seven real-world hypergraph benchmarks in node\nclassification and hyperlink prediction show that, compared to state-of-the-art\nHNNs, TF-HNN exhibits both competitive performance and superior training\nefficiency. Specifically, on the large-scale benchmark, Trivago, TF-HNN\noutperforms the node classification accuracy of the best baseline by 10% with\njust 1% of the training time of that baseline.\n","authors":["Bohan Tang","Zexi Liu","Keyue Jiang","Siheng Chen","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2402.05569v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14666v2","updated":"2025-03-02T10:38:32Z","published":"2024-10-18T17:56:11Z","title":"DiscoGraMS: Enhancing Movie Screen-Play Summarization using Movie\n Character-Aware Discourse Graph","summary":" Summarizing movie screenplays presents a unique set of challenges compared to\nstandard document summarization. Screenplays are not only lengthy, but also\nfeature a complex interplay of characters, dialogues, and scenes, with numerous\ndirect and subtle relationships and contextual nuances that are difficult for\nmachine learning models to accurately capture and comprehend. Recent attempts\nat screenplay summarization focus on fine-tuning transformer-based pre-trained\nmodels, but these models often fall short in capturing long-term dependencies\nand latent relationships, and frequently encounter the \"lost in the middle\"\nissue. To address these challenges, we introduce DiscoGraMS, a novel resource\nthat represents movie scripts as a movie character-aware discourse graph (CaD\nGraph). 
This approach is well-suited for various downstream tasks, such as\nsummarization, question-answering, and salience detection. The model aims to\npreserve all salient information, offering a more comprehensive and faithful\nrepresentation of the screenplay's content. We further explore a baseline\nmethod that combines the CaD Graph with the corresponding movie script through\na late fusion of graph and text modalities, and we present very initial\npromising results.\n","authors":["Maitreya Prafulla Chitale","Uday Bindal","Rajakrishnan Rajkumar","Rahul Mishra"],"pdf_url":"https://arxiv.org/pdf/2410.14666v2.pdf","comment":"Accepted at NAACL 2025 (Main)"},{"id":"http://arxiv.org/abs/2411.15216v2","updated":"2025-03-02T10:23:51Z","published":"2024-11-20T16:17:40Z","title":"Dist Loss: Enhancing Regression in Few-Shot Region through Distribution\n Distance Constraint","summary":" Imbalanced data distributions are prevalent in real-world scenarios, posing\nsignificant challenges in both imbalanced classification and imbalanced\nregression tasks. They often cause deep learning models to overfit in areas of\nhigh sample density (many-shot regions) while underperforming in areas of low\nsample density (few-shot regions). This characteristic restricts the utility of\ndeep learning models in various sectors, notably healthcare, where areas with\nfew-shot data hold greater clinical relevance. While recent studies have shown\nthe benefits of incorporating distribution information in imbalanced\nclassification tasks, such strategies are rarely explored in imbalanced\nregression. 
In this paper, we address this issue by introducing a novel loss\nfunction, termed Dist Loss, designed to minimize the distribution distance\nbetween the model's predictions and the target labels in a differentiable\nmanner, effectively integrating distribution information into model training.\nDist Loss enables deep learning models to regularize their output distribution\nduring training, effectively enhancing their focus on few-shot regions. We have\nconducted extensive experiments across three datasets spanning computer vision\nand healthcare: IMDB-WIKI-DIR, AgeDB-DIR, and ECG-Ka-DIR. The results\ndemonstrate that Dist Loss effectively mitigates the negative impact of\nimbalanced data distribution on model performance, achieving state-of-the-art\nresults in sparse data regions. Furthermore, Dist Loss is easy to integrate,\ncomplementing existing methods.\n","authors":["Guangkun Nie","Gongzheng Tang","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2411.15216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.18176v2","updated":"2025-03-02T09:22:47Z","published":"2025-02-25T13:09:34Z","title":"CLIPure: Purification in Latent Space via CLIP for Adversarially Robust\n Zero-Shot Classification","summary":" In this paper, we aim to build an adversarially robust zero-shot image\nclassifier. We ground our work on CLIP, a vision-language pre-trained encoder\nmodel that can perform zero-shot classification by matching an image with text\nprompts ``a photo of a .''. Purification is the path we choose\nsince it does not require adversarial training on specific attack types and\nthus can cope with any foreseen attacks. We then formulate purification risk as\nthe KL divergence between the joint distributions of the purification process\nof denoising the adversarial samples and the attack process of adding\nperturbations to benign samples, through bidirectional Stochastic Differential\nEquations (SDEs). 
The final derived results inspire us to explore purification\nin the multi-modal latent space of CLIP. We propose two variants for our\nCLIPure approach: CLIPure-Diff which models the likelihood of images' latent\nvectors with the DiffusionPrior module in DaLLE-2 (modeling the generation\nprocess of CLIP's latent vectors), and CLIPure-Cos which models the likelihood\nwith the cosine similarity between the embeddings of an image and ``a photo of\na.''. As far as we know, CLIPure is the first purification method in\nmulti-modal latent space and CLIPure-Cos is the first purification method that\nis not based on generative models, which substantially improves defense\nefficiency. We conducted extensive experiments on CIFAR-10, ImageNet, and 13\ndatasets that previous CLIP-based defense methods used for evaluating zero-shot\nclassification robustness. Results show that CLIPure boosts the SOTA robustness\nby a large margin, e.g., from 71.7% to 91.1% on CIFAR10, from 59.6% to 72.6% on\nImageNet, and 108% relative improvements of average robustness on the 13\ndatasets over previous SOTA. The code is available at\nhttps://github.com/TMLResearchGroup-CAS/CLIPure.\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2502.18176v2.pdf","comment":"accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2409.01281v2","updated":"2025-03-02T09:13:56Z","published":"2024-08-25T01:45:53Z","title":"Path-Consistency: Prefix Enhancement for Efficient Inference in LLM","summary":" To enhance the reasoning capabilities of large language models (LLMs),\nself-consistency has gained significant popularity by combining multiple\nsampling with majority voting. However, the state-of-the-art self-consistency\napproaches consume substantial computational resources and lead to significant\nadditional time costs due to the multiple sampling. 
This prevents its full\npotential from being realized in scenarios where computational resources are\ncritical. To improve the inference efficiency, this paper introduces\n\\textit{path-consistency}, a method that leverages the confidence of answers\ngenerated in earlier branches to identify the prefix of the most promising\npath. By dynamically guiding the generation of subsequent branches based on\nthis prefix, the \\textit{path-consistency} mitigates both the errors and\nredundancies from random or less useful sampling in self-consistency. As a\nresult, it can significantly accelerate the inference process by reducing the\nnumber of tokens generated. Our extensive empirical evaluation shows that the\n\\textit{path-consistency} achieves significant acceleration in inference\nlatency ranging from $7.8\\%$ to $40.5\\%$, while maintaining or even improving\ntask accuracy across different datasets, including mathematical reasoning,\ncommon sense reasoning, symbolic reasoning, and code generation.\n","authors":["Jiace Zhu","Yingtao Shen","Jie Zhao","An Zou"],"pdf_url":"https://arxiv.org/pdf/2409.01281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06614v2","updated":"2025-03-02T08:59:29Z","published":"2024-10-09T07:09:46Z","title":"Pair-VPR: Place-Aware Pre-training and Contrastive Pair Classification\n for Visual Place Recognition with Vision Transformers","summary":" In this work we propose a novel joint training method for Visual Place\nRecognition (VPR), which simultaneously learns a global descriptor and a pair\nclassifier for re-ranking. The pair classifier can predict whether a given pair\nof images are from the same place or not. The network only comprises Vision\nTransformer components for both the encoder and the pair classifier, and both\ncomponents are trained using their respective class tokens. In existing VPR\nmethods, typically the network is initialized using pre-trained weights from a\ngeneric image dataset such as ImageNet. 
In this work we propose an alternative\npre-training strategy, by using Siamese Masked Image Modelling as a\npre-training task. We propose a Place-aware image sampling procedure from a\ncollection of large VPR datasets for pre-training our model, to learn visual\nfeatures tuned specifically for VPR. By re-using the Mask Image Modelling\nencoder and decoder weights in the second stage of training, Pair-VPR can\nachieve state-of-the-art VPR performance across five benchmark datasets with a\nViT-B encoder, along with further improvements in localization recall with\nlarger encoders. The Pair-VPR website is:\nhttps://csiro-robotics.github.io/Pair-VPR.\n","authors":["Stephen Hausler","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2410.06614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.07254v2","updated":"2025-03-02T08:56:31Z","published":"2025-02-11T04:42:00Z","title":"Fairness in Agentic AI: A Unified Framework for Ethical and Equitable\n Multi-Agent System","summary":" Ensuring fairness in decentralized multi-agent systems presents significant\nchallenges due to emergent biases, systemic inefficiencies, and conflicting\nagent incentives. This paper provides a comprehensive survey of fairness in\nmulti-agent AI, introducing a novel framework where fairness is treated as a\ndynamic, emergent property of agent interactions. The framework integrates\nfairness constraints, bias mitigation strategies, and incentive mechanisms to\nalign autonomous agent behaviors with societal values while balancing\nefficiency and robustness. 
Through empirical validation, we demonstrate that\nincorporating fairness constraints results in more equitable decision-making.\nThis work bridges the gap between AI ethics and system design, offering a\nfoundation for accountable, transparent, and socially responsible multi-agent\nAI systems.\n","authors":["Rajesh Ranjan","Shailja Gupta","Surya Narayan Singh"],"pdf_url":"https://arxiv.org/pdf/2502.07254v2.pdf","comment":"12 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2502.20092v2","updated":"2025-03-02T08:56:15Z","published":"2025-02-27T13:51:56Z","title":"WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model\n Evaluation","summary":" The UAV technology is gradually maturing and can provide extremely powerful\nsupport for smart agriculture and precise monitoring. Currently, there is no\ndataset related to green walnuts in the field of agricultural computer vision.\nThus, in order to promote the algorithm design in the field of agricultural\ncomputer vision, we used UAV to collect remote-sensing data from 8 walnut\nsample plots. Considering that green walnuts are subject to various lighting\nconditions and occlusion, we constructed a large-scale dataset with a\nhigher-granularity of target features - WalnutData. This dataset contains a\ntotal of 30,240 images and 706,208 instances, and there are 4 target\ncategories: being illuminated by frontal light and unoccluded (A1), being\nbacklit and unoccluded (A2), being illuminated by frontal light and occluded\n(B1), and being backlit and occluded (B2). Subsequently, we evaluated many\nmainstream algorithms on WalnutData and used these evaluation results as the\nbaseline standard. 
The dataset and all evaluation results can be obtained at\nhttps://github.com/1wuming/WalnutData.\n","authors":["Mingjie Wu","Chenggui Yang","Huihua Wang","Chen Xue","Yibo Wang","Haoyu Wang","Yansong Wang","Can Peng","Yuqi Han","Ruoyu Li","Lijun Yun","Zaiqing Chen","Songfan Shi","Luhao Fang","Shuyi Wan","Tingfeng Li","Shuangyao Liu","Haotian Feng"],"pdf_url":"https://arxiv.org/pdf/2502.20092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14808v2","updated":"2025-03-02T08:53:47Z","published":"2024-11-22T09:08:58Z","title":"High-Resolution Image Synthesis via Next-Token Prediction","summary":" Recently, autoregressive models have demonstrated remarkable performance in\nclass-conditional image generation. However, the application of next-token\nprediction to high-resolution text-to-image generation remains largely\nunexplored. In this paper, we introduce \\textbf{D-JEPA$\\cdot$T2I}, an\nautoregressive model based on continuous tokens that incorporates innovations\nin both architecture and training strategy to generate high-quality,\nphotorealistic images at arbitrary resolutions, up to 4K. Architecturally, we\nadopt the denoising joint embedding predictive architecture (D-JEPA) while\nleveraging a multimodal visual transformer to effectively integrate textual and\nvisual features. Additionally, we introduce flow matching loss alongside the\nproposed Visual Rotary Positional Embedding (VoPE) to enable continuous\nresolution learning. In terms of training strategy, we propose a data feedback\nmechanism that dynamically adjusts the sampling procedure based on statistical\nanalysis and an online learning critic model. This encourages the model to move\nbeyond its comfort zone, reducing redundant training on well-mastered scenarios\nand compelling it to address more challenging cases with suboptimal generation\nquality. 
For the first time, we achieve state-of-the-art high-resolution image\nsynthesis via next-token prediction.\n","authors":["Dengsheng Chen","Jie Hu","Tiezhu Yue","Xiaoming Wei","Enhua Wu"],"pdf_url":"https://arxiv.org/pdf/2411.14808v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2406.11909v4","updated":"2025-03-02T08:40:16Z","published":"2024-06-16T14:19:49Z","title":"Mixture-of-Subspaces in Low-Rank Adaptation","summary":" In this paper, we introduce a subspace-inspired Low-Rank Adaptation (LoRA)\nmethod, which is computationally efficient, easy to implement, and readily\napplicable to large language, multimodal, and diffusion models. Initially, we\nequivalently decompose the weights of LoRA into two subspaces, and find that\nsimply mixing them can enhance performance. To study such a phenomenon, we\nrevisit it through a fine-grained subspace lens, showing that such modification\nis equivalent to employing a fixed mixer to fuse the subspaces. To be more\nflexible, we jointly learn the mixer with the original LoRA weights, and term\nthe method Mixture-of-Subspaces LoRA (MoSLoRA). MoSLoRA consistently\noutperforms LoRA on tasks in different modalities, including commonsense\nreasoning, visual instruction tuning, and subject-driven text-to-image\ngeneration, demonstrating its effectiveness and robustness. Codes are available\nat https://github.com/wutaiqiang/MoSLoRA.\n","authors":["Taiqiang Wu","Jiahao Wang","Zhe Zhao","Ngai Wong"],"pdf_url":"https://arxiv.org/pdf/2406.11909v4.pdf","comment":"EMNLP 2024 Main, Oral"},{"id":"http://arxiv.org/abs/2408.02976v3","updated":"2025-03-02T08:30:58Z","published":"2024-08-06T06:16:00Z","title":"Empathy Level Alignment via Reinforcement Learning for Empathetic\n Response Generation","summary":" Empathetic response generation, aiming to understand the user's situation and\nfeelings and respond empathically, is crucial in building human-like dialogue\nsystems. 
Traditional approaches typically employ maximum likelihood estimation\nas the optimization objective during training, yet fail to align the empathy\nlevels between generated and target responses. To this end, we propose an\nempathetic response generation framework using reinforcement learning (EmpRL).\nThe framework develops an effective empathy reward function and generates\nempathetic responses by maximizing the expected reward through reinforcement\nlearning. EmpRL utilizes the pre-trained T5 model as the generator and further\nfine-tunes it to initialize the policy. To align the empathy levels between\ngenerated and target responses within a given context, an empathy reward\nfunction containing three empathy communication mechanisms -- emotional\nreaction, interpretation, and exploration -- is constructed using pre-designed\nand pre-trained empathy identifiers. During reinforcement learning training,\nthe proximal policy optimization algorithm is used to fine-tune the policy,\nenabling the generation of empathetic responses. Both automatic and human\nevaluations demonstrate that the proposed EmpRL framework significantly\nimproves the quality of generated responses, enhances the similarity in empathy\nlevels between generated and target responses, and produces empathetic\nresponses covering both affective and cognitive aspects.\n","authors":["Hui Ma","Bo Zhang","Bo Xu","Jian Wang","Hongfei Lin","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2408.02976v3.pdf","comment":"Accepted by IEEE Transactions on Affective Computing"},{"id":"http://arxiv.org/abs/2407.04285v4","updated":"2025-03-02T08:28:00Z","published":"2024-07-05T06:34:32Z","title":"Tackling Data Corruption in Offline Reinforcement Learning via Sequence\n Modeling","summary":" Learning policy from offline datasets through offline reinforcement learning\n(RL) holds promise for scaling data-driven decision-making while avoiding\nunsafe and costly online interactions. 
However, real-world data collected from\nsensors or humans often contains noise and errors, posing a significant\nchallenge for existing offline RL methods, particularly when the real-world\ndata is limited. Our study reveals that prior research focusing on adapting\npredominant offline RL methods based on temporal difference learning still\nfalls short under data corruption when the dataset is limited. In contrast, we\ndiscover that vanilla sequence modeling methods, such as Decision Transformer,\nexhibit robustness against data corruption, even without specialized\nmodifications. To unlock the full potential of sequence modeling, we propose\nRobust Decision Rransformer (RDT) by incorporating three simple yet effective\nrobust techniques: embedding dropout to improve the model's robustness against\nerroneous inputs, Gaussian weighted learning to mitigate the effects of\ncorrupted labels, and iterative data correction to eliminate corrupted data\nfrom the source. Extensive experiments on MuJoCo, Kitchen, and Adroit tasks\ndemonstrate RDT's superior performance under various data corruption scenarios\ncompared to prior methods. Furthermore, RDT exhibits remarkable robustness in a\nmore challenging setting that combines training-time data corruption with\ntest-time observation perturbations. These results highlight the potential of\nsequence modeling for learning from noisy or corrupted offline datasets,\nthereby promoting the reliable application of offline RL in real-world\nscenarios. 
Our code is available at\nhttps://github.com/jiawei415/RobustDecisionTransformer.\n","authors":["Jiawei Xu","Rui Yang","Shuang Qiu","Feng Luo","Meng Fang","Baoxiang Wang","Lei Han"],"pdf_url":"https://arxiv.org/pdf/2407.04285v4.pdf","comment":"Accepted by ICLR2025"},{"id":"http://arxiv.org/abs/2407.00886v3","updated":"2025-03-02T08:26:23Z","published":"2024-07-01T01:12:20Z","title":"Efficient Automated Circuit Discovery in Transformers using Contextual\n Decomposition","summary":" Automated mechanistic interpretation research has attracted great interest\ndue to its potential to scale explanations of neural network internals to large\nmodels. Existing automated circuit discovery work relies on activation patching\nor its approximations to identify subgraphs in models for specific tasks\n(circuits). They often suffer from slow runtime, approximation errors, and\nspecific requirements of metrics, such as non-zero gradients. In this work, we\nintroduce contextual decomposition for transformers (CD-T) to build\ninterpretable circuits in large language models. CD-T can produce circuits of\narbitrary level of abstraction, and is the first able to produce circuits as\nfine-grained as attention heads at specific sequence positions efficiently.\nCD-T consists of a set of mathematical equations to isolate contribution of\nmodel features. Through recursively computing contribution of all nodes in a\ncomputational graph of a model using CD-T followed by pruning, we are able to\nreduce circuit discovery runtime from hours to seconds compared to\nstate-of-the-art baselines. On three standard circuit evaluation datasets\n(indirect object identification, greater-than comparisons, and docstring\ncompletion), we demonstrate that CD-T outperforms ACDC and EAP by better\nrecovering the manual circuits with an average of 97% ROC AUC under low\nruntimes. 
In addition, we provide evidence that faithfulness of CD-T circuits\nis not due to random chance by showing our circuits are 80% more faithful than\nrandom circuits of up to 60% of the original model size. Finally, we show CD-T\ncircuits are able to perfectly replicate original models' behavior\n(faithfulness $ = 1$) using fewer nodes than the baselines for all tasks. Our\nresults underscore the great promise of CD-T for efficient automated\nmechanistic interpretability, paving the way for new insights into the workings\nof large language models.\n","authors":["Aliyah R. Hsu","Georgia Zhou","Yeshwanth Cherapanamjeri","Yaxuan Huang","Anobel Y. Odisho","Peter R. Carroll","Bin Yu"],"pdf_url":"https://arxiv.org/pdf/2407.00886v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.10177v2","updated":"2025-03-02T08:14:27Z","published":"2025-02-14T14:12:09Z","title":"STMA: A Spatio-Temporal Memory Agent for Long-Horizon Embodied Task\n Planning","summary":" A key objective of embodied intelligence is enabling agents to perform\nlong-horizon tasks in dynamic environments while maintaining robust\ndecision-making and adaptability. To achieve this goal, we propose the\nSpatio-Temporal Memory Agent (STMA), a novel framework designed to enhance task\nplanning and execution by integrating spatio-temporal memory. STMA is built\nupon three critical components: (1) a spatio-temporal memory module that\ncaptures historical and environmental changes in real time, (2) a dynamic\nknowledge graph that facilitates adaptive spatial reasoning, and (3) a\nplanner-critic mechanism that iteratively refines task strategies. We evaluate\nSTMA in the TextWorld environment on 32 tasks, involving multi-step planning\nand exploration under varying levels of complexity. Experimental results\ndemonstrate that STMA achieves a 31.25% improvement in success rate and a 24.7%\nincrease in average score compared to the state-of-the-art model. 
The results\nhighlight the effectiveness of spatio-temporal memory in advancing the memory\ncapabilities of embodied agents.\n","authors":["Mingcong Lei","Yiming Zhao","Ge Wang","Zhixin Mai","Shuguang Cui","Yatong Han","Jinke Ren"],"pdf_url":"https://arxiv.org/pdf/2502.10177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07533v3","updated":"2025-03-02T07:59:30Z","published":"2024-04-11T07:54:14Z","title":"Exploring the Decentraland Economy: Multifaceted Parcel Attributes, Key\n Insights, and Benchmarking","summary":" This paper presents a comprehensive Decentraland parcels dataset, called\nIITP-VDLand, sourced from diverse platforms such as Decentraland, OpenSea,\nEtherscan, Google BigQuery, and various Social Media Platforms. Unlike existing\ndatasets which have limited attributes and records, IITP-VDLand offers a rich\narray of attributes, encompassing parcel characteristics, trading history, past\nactivities, transactions, and social media interactions. Alongside, we\nintroduce a key attribute in the dataset, namely Rarity score, which measures\nthe uniqueness of each parcel within the virtual world. Addressing the\nsignificant challenge posed by the dispersed nature of this data across various\nsources, we employ a systematic approach, utilizing both available APIs and\ncustom scripts, to gather it. Subsequently, we meticulously curate and organize\nthe information into four distinct fragments: (1) Characteristics, (2) OpenSea\nTrading History, (3) Ethereum Activity Transactions, and (4) Social Media. We\nenvisage that this dataset would serve as a robust resource for training\nmachine- and deep-learning models specifically designed to address real-world\nchallenges within the domain of Decentraland parcels. The performance\nbenchmarking of more than 20 state-of-the-art price prediction models on our\ndataset yields promising results, achieving a maximum R2 score of 0.8251 and an\naccuracy of 74.23% in case of Extra Trees Regressor and Classifier. 
The key\nfindings reveal that the ensemble models perform better than both deep learning\nand linear models for our dataset. We observe a significant impact of\ncoordinates, geographical proximity, rarity score, and few other economic\nindicators on the prediction of parcel prices.\n","authors":["Dipika Jha","Ankit K. Bhagat","Raju Halder","Rajendra N. Paramanik","Chandra M. Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.07533v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03355v3","updated":"2025-03-02T07:45:09Z","published":"2024-10-04T12:21:03Z","title":"LANTERN: Accelerating Visual Autoregressive Models with Relaxed\n Speculative Decoding","summary":" Auto-Regressive (AR) models have recently gained prominence in image\ngeneration, often matching or even surpassing the performance of diffusion\nmodels. However, one major limitation of AR models is their sequential nature,\nwhich processes tokens one at a time, slowing down generation compared to\nmodels like GANs or diffusion-based methods that operate more efficiently.\nWhile speculative decoding has proven effective for accelerating LLMs by\ngenerating multiple tokens in a single forward, its application in visual AR\nmodels remains largely unexplored. In this work, we identify a challenge in\nthis setting, which we term \\textit{token selection ambiguity}, wherein visual\nAR models frequently assign uniformly low probabilities to tokens, hampering\nthe performance of speculative decoding. To overcome this challenge, we propose\na relaxed acceptance condition referred to as LANTERN that leverages the\ninterchangeability of tokens in latent space. 
This relaxation restores the\neffectiveness of speculative decoding in visual AR models by enabling more\nflexible use of candidate tokens that would otherwise be prematurely rejected.\nFurthermore, by incorporating a total variation distance bound, we ensure that\nthese speed gains are achieved without significantly compromising image quality\nor semantic coherence. Experimental results demonstrate the efficacy of our\nmethod in providing a substantial speed-up over speculative decoding. In\nspecific, compared to a na\\\"ive application of the state-of-the-art speculative\ndecoding, LANTERN increases speed-ups by $\\mathbf{1.75}\\times$ and\n$\\mathbf{1.82}\\times$, as compared to greedy decoding and random sampling,\nrespectively, when applied to LlamaGen, a contemporary visual AR model. The\ncode is publicly available at https://github.com/jadohu/LANTERN.\n","authors":["Doohyuk Jang","Sihwan Park","June Yong Yang","Yeonsung Jung","Jihun Yun","Souvik Kundu","Sung-Yub Kim","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2410.03355v3.pdf","comment":"30 pages, 13 figures, Accepted to ICLR 2025 (poster)"},{"id":"http://arxiv.org/abs/2410.13757v2","updated":"2025-03-02T07:34:35Z","published":"2024-10-17T16:53:50Z","title":"MobA: Multifaceted Memory-Enhanced Adaptive Planning for Efficient\n Mobile Task Automation","summary":" Existing Multimodal Large Language Model (MLLM)-based agents face significant\nchallenges in handling complex GUI (Graphical User Interface) interactions on\ndevices. These challenges arise from the dynamic and structured nature of GUI\nenvironments, which integrate text, images, and spatial relationships, as well\nas the variability in action spaces across different pages and tasks. To\naddress these limitations, we propose MobA, a novel MLLM-based mobile assistant\nsystem. 
MobA introduces an adaptive planning module that incorporates a\nreflection mechanism for error recovery and dynamically adjusts plans to align\nwith the real environment contexts and action module's execution capacity.\nAdditionally, a multifaceted memory module provides comprehensive memory\nsupport to enhance adaptability and efficiency. We also present MobBench, a\ndataset designed for complex mobile interactions. Experimental results on\nMobBench and AndroidArena demonstrate MobA's ability to handle dynamic GUI\nenvironments and perform complex mobile task.\n","authors":["Zichen Zhu","Hao Tang","Yansi Li","Dingye Liu","Hongshen Xu","Kunyao Lan","Danyang Zhang","Yixuan Jiang","Hao Zhou","Chenrun Wang","Situo Zhang","Liangtai Sun","Yixiao Wang","Yuheng Sun","Lu Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2410.13757v2.pdf","comment":"NAACL 2025 Demo Track"},{"id":"http://arxiv.org/abs/2501.19069v2","updated":"2025-03-02T07:22:57Z","published":"2025-01-31T11:55:17Z","title":"Improving vision-language alignment with graph spiking hybrid Networks","summary":" To bridge the semantic gap between vision and language (VL), it is necessary\nto develop a good alignment strategy, which includes handling semantic\ndiversity, abstract representation of visual information, and generalization\nability of models. Recent works use detector-based bounding boxes or patches\nwith regular partitions to represent visual semantics. While current paradigms\nhave made strides, they are still insufficient for fully capturing the nuanced\ncontextual relations among various objects. 
This paper proposes a comprehensive\nvisual semantic representation module, necessitating the utilization of\npanoptic segmentation to generate coherent fine-grained semantic features.\nFurthermore, we propose a novel Graph Spiking Hybrid Network (GSHN) that\nintegrates the complementary advantages of Spiking Neural Networks (SNNs) and\nGraph Attention Networks (GATs) to encode visual semantic information.\nIntriguingly, the model not only encodes the discrete and continuous latent\nvariables of instances but also adeptly captures both local and global\ncontextual features, thereby significantly enhancing the richness and diversity\nof semantic representations. Leveraging the spatiotemporal properties inherent\nin SNNs, we employ contrastive learning (CL) to enhance the similarity-based\nrepresentation of embeddings. This strategy alleviates the computational\noverhead of the model and enriches meaningful visual representations by\nconstructing positive and negative sample pairs. We design an innovative\npre-training method, Spiked Text Learning (STL), which uses text features to\nimprove the encoding ability of discrete semantics. Experiments show that the\nproposed GSHN exhibits promising results on multiple VL downstream tasks.\n","authors":["Siyu Zhang","Wenzhe Liu","Yeming Chen","Yiming Wu","Heming Zheng","Cheng Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.19069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14294v3","updated":"2025-03-02T06:49:21Z","published":"2025-01-24T07:24:23Z","title":"Examining Alignment of Large Language Models through Representative\n Heuristics: The Case of Political Stereotypes","summary":" Examining the alignment of large language models (LLMs) has become\nincreasingly important, e.g., when LLMs fail to operate as intended. 
This study\nexamines the alignment of LLMs with human values for the domain of politics.\nPrior research has shown that LLM-generated outputs can include political\nleanings and mimic the stances of political parties on various issues. However,\nthe extent and conditions under which LLMs deviate from empirical positions are\ninsufficiently examined. To address this gap, we analyze the factors that\ncontribute to LLMs' deviations from empirical positions on political issues,\naiming to quantify these deviations and identify the conditions that cause\nthem.\n Drawing on findings from cognitive science about representativeness\nheuristics, i.e., situations where humans lean on representative attributes of\na target group in a way that leads to exaggerated beliefs, we scrutinize LLM\nresponses through this heuristics' lens. We conduct experiments to determine\nhow LLMs inflate predictions about political parties, which results in\nstereotyping. We find that while LLMs can mimic certain political parties'\npositions, they often exaggerate these positions more than human survey\nrespondents do. Also, LLMs tend to overemphasize representativeness more than\nhumans. This study highlights the susceptibility of LLMs to representativeness\nheuristics, suggesting a potential vulnerability of LLMs that facilitates\npolitical stereotyping. 
We also test prompt-based mitigation strategies,\nfinding that strategies that can mitigate representative heuristics in humans\nare also effective in reducing the influence of representativeness on\nLLM-generated responses.\n","authors":["Sullam Jeoung","Yubin Ge","Haohan Wang","Jana Diesner"],"pdf_url":"https://arxiv.org/pdf/2501.14294v3.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2501.03836v3","updated":"2025-03-02T06:41:56Z","published":"2025-01-07T14:45:39Z","title":"SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor\n Diagnosis","summary":" Brain tumors can lead to neurological dysfunction, cognitive and\npsychological changes, increased intracranial pressure, and seizures, posing\nsignificant risks to health. The You Only Look Once (YOLO) series has shown\nsuperior accuracy in medical imaging object detection. This paper presents a\nnovel SCC-YOLO architecture that integrates the SCConv module into YOLOv9. The\nSCConv module optimizes convolutional efficiency by reducing spatial and\nchannel redundancy, enhancing image feature learning. We examine the effects of\ndifferent attention mechanisms with YOLOv9 for brain tumor detection using the\nBr35H dataset and our custom dataset (Brain_Tumor_Dataset). Results indicate\nthat SCC-YOLO improved mAP50 by 0.3% on the Br35H dataset and by 0.5% on our\ncustom dataset compared to YOLOv9. SCC-YOLO achieves state-of-the-art\nperformance in brain tumor detection.\n","authors":["Runci Bai","Guibao Xu","Yanze Shi"],"pdf_url":"https://arxiv.org/pdf/2501.03836v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12085v2","updated":"2025-03-02T06:29:15Z","published":"2024-10-15T22:06:30Z","title":"Data-adaptive Differentially Private Prompt Synthesis for In-Context\n Learning","summary":" Large Language Models (LLMs) rely on the contextual information embedded in\nexamples/demonstrations to perform in-context learning (ICL). 
To mitigate the\nrisk of LLMs potentially leaking private information contained in examples in\nthe prompt, we introduce a novel data-adaptive differentially private algorithm\ncalled AdaDPSyn to generate synthetic examples from the private dataset and\nthen use these synthetic examples to perform ICL. The objective of AdaDPSyn is\nto adaptively adjust the noise level in the data synthesis mechanism according\nto the inherent statistical properties of the data, thereby preserving high ICL\naccuracy while maintaining formal differential privacy guarantees. A key\ninnovation in AdaDPSyn is the Precision-Focused Iterative Radius Reduction\ntechnique, which dynamically refines the aggregation radius - the scope of data\ngrouping for noise addition - based on patterns observed in data clustering,\nthereby minimizing the amount of additive noise. We conduct extensive\nexperiments on standard benchmarks and compare AdaDPSyn with DP few-shot\ngeneration algorithm (Tang et al., 2023). The experiments demonstrate that\nAdaDPSyn not only outperforms DP few-shot generation, but also maintains high\naccuracy levels close to those of non-private baselines, providing an effective\nsolution for ICL with privacy protection.\n","authors":["Fengyu Gao","Ruida Zhou","Tianhao Wang","Cong Shen","Jing Yang"],"pdf_url":"https://arxiv.org/pdf/2410.12085v2.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2407.01902v2","updated":"2025-03-02T06:28:59Z","published":"2024-07-02T02:58:29Z","title":"SeqAR: Jailbreak LLMs with Sequential Auto-Generated Characters","summary":" The widespread applications of large language models (LLMs) have brought\nabout concerns regarding their potential misuse. Although aligned with human\npreference data before release, LLMs remain vulnerable to various malicious\nattacks. In this paper, we adopt a red-teaming strategy to enhance LLM safety\nand introduce SeqAR, a simple yet effective framework to design jailbreak\nprompts automatically. 
The SeqAR framework generates and optimizes multiple\njailbreak characters and then applies sequential jailbreak characters in a\nsingle query to bypass the guardrails of the target LLM. Different from\nprevious work which relies on proprietary LLMs or seed jailbreak templates\ncrafted by human expertise, SeqAR can generate and optimize the jailbreak\nprompt in a cold-start scenario using open-sourced LLMs without any seed\njailbreak templates. Experimental results show that SeqAR achieves attack\nsuccess rates of 88% and 60% in bypassing the safety alignment of GPT-3.5-1106\nand GPT-4, respectively. Furthermore, we extensively evaluate the\ntransferability of the generated templates across different LLMs and held-out\nmalicious requests, while also exploring defense strategies against the\njailbreak attack designed by SeqAR.\n","authors":["Yan Yang","Zeguan Xiao","Xin Lu","Hongru Wang","Xuetao Wei","Hailiang Huang","Guanhua Chen","Yun Chen"],"pdf_url":"https://arxiv.org/pdf/2407.01902v2.pdf","comment":"Accepted by NAACL 2025"},{"id":"http://arxiv.org/abs/2410.07672v2","updated":"2025-03-02T06:25:14Z","published":"2024-10-10T07:29:35Z","title":"MACPO: Weak-to-Strong Alignment via Multi-Agent Contrastive Preference\n Optimization","summary":" As large language models (LLMs) are rapidly advancing and achieving\nnear-human capabilities on specific tasks, aligning them with human values is\nbecoming more urgent. In scenarios where LLMs outperform humans, we face a\nweak-to-strong alignment problem where we need to effectively align strong\nstudent LLMs through weak supervision generated by weak teachers. Existing\nalignment methods mainly focus on strong-to-weak alignment and self-alignment\nsettings, and it is impractical to adapt them to the much harder weak-to-strong\nalignment setting. To fill this gap, we propose a multi-agent contrastive\npreference optimization (MACPO) framework. 
MACPO facilitates weak teachers and\nstrong students to learn from each other by iteratively reinforcing unfamiliar\npositive behaviors while penalizing familiar negative ones. To get this, we\ndevise a mutual positive behavior augmentation strategy to encourage weak\nteachers and strong students to learn from each other's positive behavior and\nfurther provide higher quality positive behavior for the next iteration.\nAdditionally, we propose a hard negative behavior construction strategy to\ninduce weak teachers and strong students to generate familiar negative behavior\nby fine-tuning on negative behavioral data. Experimental results on the HH-RLHF\nand PKU-SafeRLHF datasets, evaluated using both automatic metrics and human\njudgments, demonstrate that MACPO simultaneously improves the alignment\nperformance of strong students and weak teachers. Moreover, as the number of\nweak teachers increases, MACPO achieves better weak-to-strong alignment\nperformance through more iteration optimization rounds.\n","authors":["Yougang Lyu","Lingyong Yan","Zihan Wang","Dawei Yin","Pengjie Ren","Maarten de Rijke","Zhaochun Ren"],"pdf_url":"https://arxiv.org/pdf/2410.07672v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2502.19260v2","updated":"2025-03-02T06:08:34Z","published":"2025-02-26T16:06:35Z","title":"EMT: A Visual Multi-Task Benchmark Dataset for Autonomous Driving in the\n Arab Gulf Region","summary":" This paper introduces the Emirates Multi-Task (EMT) dataset - the first\npublicly available dataset for autonomous driving collected in the Arab Gulf\nregion. The EMT dataset captures the unique road topology, high traffic\ncongestion, and distinctive characteristics of the Gulf region, including\nvariations in pedestrian clothing and weather conditions. It contains over\n30,000 frames from a dash-camera perspective, along with 570,000 annotated\nbounding boxes, covering approximately 150 kilometers of driving routes. 
The\nEMT dataset supports three primary tasks: tracking, trajectory forecasting and\nintention prediction. Each benchmark dataset is complemented with corresponding\nevaluations: (1) multi-agent tracking experiments, focusing on multi-class\nscenarios and occlusion handling; (2) trajectory forecasting evaluation using\ndeep sequential and interaction-aware models; and (3) intention benchmark\nexperiments conducted for predicting agents intentions from observed\ntrajectories. The dataset is publicly available at avlab.io/emt-dataset, and\npre-processing scripts along with evaluation models can be accessed at\ngithub.com/AV-Lab/emt-dataset.\n","authors":["Nadya Abdel Madjid","Murad Mebrahtu","Abdelmoamen Nasser","Bilal Hassan","Naoufel Werghi","Jorge Dias","Majid Khonji"],"pdf_url":"https://arxiv.org/pdf/2502.19260v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2502.17487v2","updated":"2025-03-02T06:05:40Z","published":"2025-02-18T16:49:36Z","title":"User Intent to Use DeepSeek for Healthcare Purposes and their Trust in\n the Large Language Model: Multinational Survey Study","summary":" Large language models (LLMs) increasingly serve as interactive healthcare\nresources, yet user acceptance remains underexplored. This study examines how\nease of use, perceived usefulness, trust, and risk perception interact to shape\nintentions to adopt DeepSeek, an emerging LLM-based platform, for healthcare\npurposes. A cross-sectional survey of 556 participants from India, the United\nKingdom, and the United States was conducted to measure perceptions and usage\npatterns. Structural equation modeling assessed both direct and indirect\neffects, including potential quadratic relationships. Results revealed that\ntrust plays a pivotal mediating role: ease of use exerts a significant indirect\neffect on usage intentions through trust, while perceived usefulness\ncontributes to both trust development and direct adoption. 
By contrast, risk\nperception negatively affects usage intent, emphasizing the importance of\nrobust data governance and transparency. Notably, significant non-linear paths\nwere observed for ease of use and risk, indicating threshold or plateau\neffects. The measurement model demonstrated strong reliability and validity,\nsupported by high composite reliabilities, average variance extracted, and\ndiscriminant validity measures. These findings extend technology acceptance and\nhealth informatics research by illuminating the multifaceted nature of user\nadoption in sensitive domains. Stakeholders should invest in trust-building\nstrategies, user-centric design, and risk mitigation measures to encourage\nsustained and safe uptake of LLMs in healthcare. Future work can employ\nlongitudinal designs or examine culture-specific variables to further clarify\nhow user perceptions evolve over time and across different regulatory\nenvironments. Such insights are critical for harnessing AI to enhance outcomes.\n","authors":["Avishek Choudhury","Yeganeh Shahsavar","Hamid Shamszare"],"pdf_url":"https://arxiv.org/pdf/2502.17487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08435v2","updated":"2025-03-02T05:13:28Z","published":"2024-08-15T21:59:23Z","title":"Automated Design of Agentic Systems","summary":" Researchers are investing substantial effort in developing powerful\ngeneral-purpose agents, wherein Foundation Models are used as modules within\nagentic systems (e.g. Chain-of-Thought, Self-Reflection, Toolformer). However,\nthe history of machine learning teaches us that hand-designed solutions are\neventually replaced by learned solutions. We describe a newly forming research\narea, Automated Design of Agentic Systems (ADAS), which aims to automatically\ncreate powerful agentic system designs, including inventing novel building\nblocks and/or combining them in new ways. 
We further demonstrate that there is\nan unexplored yet promising approach within ADAS where agents can be defined in\ncode and new agents can be automatically discovered by a meta agent programming\never better ones in code. Given that programming languages are Turing Complete,\nthis approach theoretically enables the learning of any possible agentic\nsystem: including novel prompts, tool use, workflows, and combinations thereof.\nWe present a simple yet effective algorithm named Meta Agent Search to\ndemonstrate this idea, where a meta agent iteratively programs interesting new\nagents based on an ever-growing archive of previous discoveries. Through\nextensive experiments across multiple domains including coding, science, and\nmath, we show that our algorithm can progressively invent agents with novel\ndesigns that greatly outperform state-of-the-art hand-designed agents.\nImportantly, we consistently observe the surprising result that agents invented\nby Meta Agent Search maintain superior performance even when transferred across\ndomains and models, demonstrating their robustness and generality. Provided we\ndevelop it safely, our work illustrates the potential of an exciting new\nresearch direction toward automatically designing ever-more powerful agentic\nsystems to benefit humanity.\n","authors":["Shengran Hu","Cong Lu","Jeff Clune"],"pdf_url":"https://arxiv.org/pdf/2408.08435v2.pdf","comment":"Website: https://shengranhu.com/ADAS"},{"id":"http://arxiv.org/abs/2410.21533v2","updated":"2025-03-02T04:39:42Z","published":"2024-10-28T21:02:13Z","title":"L3Ms -- Lagrange Large Language Models","summary":" Supervised fine-tuning (SFT) and alignment of large language models (LLMs)\nare key steps in providing a good user experience. However, the concept of an\nappropriate alignment is inherently application-dependent, and current methods\noften rely on heuristic choices to drive optimization. 
In this work, we\nformulate SFT and alignment as a constrained optimization problem: the LLM is\nfine-tuned on a task while being required to meet application-specific\nrequirements, without resorting to heuristics. To solve this, we propose\nLagrange Large Language Models (L3Ms), which employ logarithmic barriers to\nenforce the constraints. This approach allows for the customization of L3Ms\nacross diverse applications while avoiding heuristic-driven processes. We\nexperimentally demonstrate the versatility and efficacy of L3Ms in achieving\ntailored alignments for various applications.\n","authors":["Guneet S. Dhillon","Xingjian Shi","Yee Whye Teh","Alex Smola"],"pdf_url":"https://arxiv.org/pdf/2410.21533v2.pdf","comment":"International Conference on Learning Representations (ICLR), 2025"},{"id":"http://arxiv.org/abs/2502.10709v2","updated":"2025-03-02T04:37:08Z","published":"2025-02-15T07:45:20Z","title":"An Empirical Analysis of Uncertainty in Large Language Model Evaluations","summary":" As LLM-as-a-Judge emerges as a new paradigm for assessing large language\nmodels (LLMs), concerns have been raised regarding the alignment, bias, and\nstability of LLM evaluators. While substantial work has focused on alignment\nand bias, little research has concentrated on the stability of LLM evaluators.\nIn this paper, we conduct extensive experiments involving 9 widely used LLM\nevaluators across 2 different evaluation settings to investigate the\nuncertainty in model-based LLM evaluations. We pinpoint that LLM evaluators\nexhibit varying uncertainty based on model families and sizes. With careful\ncomparative analyses, we find that employing special prompting strategies,\nwhether during inference or post-training, can alleviate evaluation uncertainty\nto some extent. 
By utilizing uncertainty to enhance LLM's reliability and\ndetection capability in Out-Of-Distribution (OOD) data, we further fine-tune an\nuncertainty-aware LLM evaluator named ConfiLM using a human-annotated\nfine-tuning set and assess ConfiLM's OOD evaluation ability on a manually\ndesigned test set sourced from the 2024 Olympics. Experimental results\ndemonstrate that incorporating uncertainty as additional information during the\nfine-tuning phase can largely improve the model's evaluation performance in OOD\nscenarios. The code and data are released at:\nhttps://github.com/hasakiXie123/LLM-Evaluator-Uncertainty.\n","authors":["Qiujie Xie","Qingqiu Li","Zhuohao Yu","Yuejie Zhang","Yue Zhang","Linyi Yang"],"pdf_url":"https://arxiv.org/pdf/2502.10709v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2408.08258v3","updated":"2025-03-02T04:25:12Z","published":"2024-08-15T16:59:15Z","title":"Snuffy: Efficient Whole Slide Image Classifier","summary":" Whole Slide Image (WSI) classification with multiple instance learning (MIL)\nin digital pathology faces significant computational challenges. Current\nmethods mostly rely on extensive self-supervised learning (SSL) for\nsatisfactory performance, requiring long training periods and considerable\ncomputational resources. At the same time, no pre-training affects performance\ndue to domain shifts from natural images to WSIs. We introduce Snuffy\narchitecture, a novel MIL-pooling method based on sparse transformers that\nmitigates performance loss with limited pre-training and enables continual\nfew-shot pre-training as a competitive option. Our sparsity pattern is tailored\nfor pathology and is theoretically proven to be a universal approximator with\nthe tightest probabilistic sharp bound on the number of layers for sparse\ntransformers, to date. 
We demonstrate Snuffy's effectiveness on CAMELYON16 and\nTCGA Lung cancer datasets, achieving superior WSI and patch-level accuracies.\nThe code is available on https://github.com/jafarinia/snuffy.\n","authors":["Hossein Jafarinia","Alireza Alipanah","Danial Hamdi","Saeed Razavi","Nahal Mirzaie","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.08258v3.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2311.15487v5","updated":"2025-03-02T04:21:13Z","published":"2023-11-27T02:12:02Z","title":"Global $\\mathcal{L}^2$ minimization at uniform exponential rate via\n geometrically adapted gradient descent in Deep Learning","summary":" We consider the scenario of supervised learning in Deep Learning (DL)\nnetworks, and exploit the arbitrariness of choice in the Riemannian metric\nrelative to which the gradient descent flow can be defined (a general fact of\ndifferential geometry). In the standard approach to DL, the gradient flow on\nthe space of parameters (weights and biases) is defined with respect to the\nEuclidean metric. Here instead, we choose the gradient flow with respect to the\nEuclidean metric in the output layer of the DL network. This naturally induces\ntwo modified versions of the gradient descent flow in the parameter space, one\nadapted for the overparametrized setting, and the other for the\nunderparametrized setting. In the overparametrized case, we prove that,\nprovided that a rank condition holds, all orbits of the modified gradient\ndescent drive the ${\\mathcal L}^2$ cost to its global minimum at a uniform\nexponential convergence rate; one thereby obtains an a priori stopping time for\nany prescribed proximity to the global minimum. We point out relations of the\nlatter to sub-Riemannian geometry. 
Moreover, we generalize the above framework\nto the situation in which the rank condition does not hold; in particular, we\nshow that local equilibria can only exist if a rank loss occurs, and that\ngenerically, they are not isolated points, but elements of a critical\nsubmanifold of parameter space.\n","authors":["Thomas Chen"],"pdf_url":"https://arxiv.org/pdf/2311.15487v5.pdf","comment":"AMS Latex, 20 pages. Typos corrected, references and comments added"},{"id":"http://arxiv.org/abs/2308.11432v7","updated":"2025-03-02T04:04:03Z","published":"2023-08-22T13:30:37Z","title":"A Survey on Large Language Model based Autonomous Agents","summary":" Autonomous agents have long been a prominent research focus in both academic\nand industry communities. Previous research in this field often focuses on\ntraining agents with limited knowledge within isolated environments, which\ndiverges significantly from human learning processes, and thus makes the agents\nhard to achieve human-like decisions. Recently, through the acquisition of vast\namounts of web knowledge, large language models (LLMs) have demonstrated\nremarkable potential in achieving human-level intelligence. This has sparked an\nupsurge in studies investigating LLM-based autonomous agents. In this paper, we\npresent a comprehensive survey of these studies, delivering a systematic review\nof the field of LLM-based autonomous agents from a holistic perspective. More\nspecifically, we first discuss the construction of LLM-based autonomous agents,\nfor which we propose a unified framework that encompasses a majority of the\nprevious work. Then, we present a comprehensive overview of the diverse\napplications of LLM-based autonomous agents in the fields of social science,\nnatural science, and engineering. Finally, we delve into the evaluation\nstrategies commonly used for LLM-based autonomous agents. Based on the previous\nstudies, we also present several challenges and future directions in this\nfield. 
To keep track of this field and continuously update our survey, we\nmaintain a repository of relevant references at\nhttps://github.com/Paitesanshi/LLM-Agent-Survey.\n","authors":["Lei Wang","Chen Ma","Xueyang Feng","Zeyu Zhang","Hao Yang","Jingsen Zhang","Zhiyuan Chen","Jiakai Tang","Xu Chen","Yankai Lin","Wayne Xin Zhao","Zhewei Wei","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.11432v7.pdf","comment":"Correcting several typos, 35 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.14985v5","updated":"2025-03-02T03:27:58Z","published":"2024-07-20T21:24:40Z","title":"Generalization v.s. Memorization: Tracing Language Models' Capabilities\n Back to Pretraining Data","summary":" The impressive capabilities of large language models (LLMs) have sparked\ndebate over whether these models genuinely generalize to unseen tasks or\npredominantly rely on memorizing vast amounts of pretraining data. To explore\nthis issue, we introduce an extended concept of memorization, distributional\nmemorization, which measures the correlation between the LLM output\nprobabilities and the pretraining data frequency. To effectively capture\ntask-specific pretraining data frequency, we propose a novel task-gram language\nmodel, which is built by counting the co-occurrence of semantically related\n$n$-gram pairs from task inputs and outputs in the pretraining corpus. Using\nthe Pythia models trained on the Pile dataset, we evaluate four distinct tasks:\nmachine translation, factual question answering, world knowledge understanding,\nand math reasoning. Our findings reveal varying levels of memorization, with\nthe strongest effect observed in factual question answering. Furthermore, while\nmodel performance improves across all tasks as LLM size increases, only factual\nquestion answering shows an increase in memorization, whereas machine\ntranslation and reasoning tasks exhibit greater generalization, producing more\nnovel outputs. 
This study demonstrates that memorization plays a larger role in\nsimpler, knowledge-intensive tasks, while generalization is the key for harder,\nreasoning-based tasks, providing a scalable method for analyzing large\npretraining corpora in greater depth.\n","authors":["Xinyi Wang","Antonis Antoniades","Yanai Elazar","Alfonso Amayuelas","Alon Albalak","Kexun Zhang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.14985v5.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2412.07236v4","updated":"2025-03-02T03:13:54Z","published":"2024-12-10T06:56:36Z","title":"CBraMod: A Criss-Cross Brain Foundation Model for EEG Decoding","summary":" Electroencephalography (EEG) is a non-invasive technique to measure and\nrecord brain electrical activity, widely used in various BCI and healthcare\napplications. Early EEG decoding methods rely on supervised learning, limited\nby specific tasks and datasets, hindering model performance and\ngeneralizability. With the success of large language models, there is a growing\nbody of studies focusing on EEG foundation models. However, these studies still\nleave challenges: Firstly, most of existing EEG foundation models employ full\nEEG modeling strategy. It models the spatial and temporal dependencies between\nall EEG patches together, but ignores that the spatial and temporal\ndependencies are heterogeneous due to the unique structural characteristics of\nEEG signals. Secondly, existing EEG foundation models have limited\ngeneralizability on a wide range of downstream BCI tasks due to varying formats\nof EEG data, making it challenging to adapt to. To address these challenges, we\npropose a novel foundation model called CBraMod. Specifically, we devise a\ncriss-cross transformer as the backbone to thoroughly leverage the structural\ncharacteristics of EEG signals, which can model spatial and temporal\ndependencies separately through two parallel attention mechanisms. 
And we\nutilize an asymmetric conditional positional encoding scheme which can encode\npositional information of EEG patches and be easily adapted to the EEG with\ndiverse formats. CBraMod is pre-trained on a very large corpus of EEG through\npatch-based masked EEG reconstruction. We evaluate CBraMod on up to 10\ndownstream BCI tasks (12 public datasets). CBraMod achieves the\nstate-of-the-art performance across the wide range of tasks, proving its strong\ncapability and generalizability. The source code is publicly available at\nhttps://github.com/wjq-learning/CBraMod.\n","authors":["Jiquan Wang","Sha Zhao","Zhiling Luo","Yangxuan Zhou","Haiteng Jiang","Shijian Li","Tao Li","Gang Pan"],"pdf_url":"https://arxiv.org/pdf/2412.07236v4.pdf","comment":"Accepted by The Thirteenth International Conference on Learning\n Representations (ICLR 2025)"},{"id":"http://arxiv.org/abs/2501.14216v2","updated":"2025-03-02T03:00:53Z","published":"2025-01-24T03:44:16Z","title":"TFG-Flow: Training-free Guidance in Multimodal Generative Flow","summary":" Given an unconditional generative model and a predictor for a target property\n(e.g., a classifier), the goal of training-free guidance is to generate samples\nwith desirable target properties without additional training. As a highly\nefficient technique for steering generative models toward flexible outcomes,\ntraining-free guidance has gained increasing attention in diffusion models.\nHowever, existing methods only handle data in continuous spaces, while many\nscientific applications involve both continuous and discrete data (referred to\nas multimodality). Another emerging trend is the growing use of the simple and\ngeneral flow matching framework in building generative foundation models, where\nguided generation remains under-explored. 
To address this, we introduce\nTFG-Flow, a novel training-free guidance method for multimodal generative flow.\nTFG-Flow addresses the curse-of-dimensionality while maintaining the property\nof unbiased sampling in guiding discrete variables. We validate TFG-Flow on\nfour molecular design tasks and show that TFG-Flow has great potential in drug\ndesign by generating molecules with desired properties.\n","authors":["Haowei Lin","Shanda Li","Haotian Ye","Yiming Yang","Stefano Ermon","Yitao Liang","Jianzhu Ma"],"pdf_url":"https://arxiv.org/pdf/2501.14216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03311v3","updated":"2025-03-02T02:45:57Z","published":"2024-07-03T17:54:11Z","title":"Efficient Imitation Without Demonstrations via Value-Penalized Auxiliary\n Control from Examples","summary":" Common approaches to providing feedback in reinforcement learning are the use\nof hand-crafted rewards or full-trajectory expert demonstrations.\nAlternatively, one can use examples of completed tasks, but such an approach\ncan be extremely sample inefficient. We introduce value-penalized auxiliary\ncontrol from examples (VPACE), an algorithm that significantly improves\nexploration in example-based control by adding examples of simple auxiliary\ntasks and an above-success-level value penalty. 
Across both simulated and real\nrobotic environments, we show that our approach substantially improves learning\nefficiency for challenging tasks, while maintaining bounded value estimates.\nPreliminary results also suggest that VPACE may learn more efficiently than the\nmore common approaches of using full trajectories or true sparse rewards.\nProject site: https://papers.starslab.ca/vpace/ .\n","authors":["Trevor Ablett","Bryan Chan","Jayce Haoran Wang","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2407.03311v3.pdf","comment":"Accepted to the IEEE International Conference on Robotics and\n Automation (ICRA'25), Atlanta, USA, May 19-23, 2025"},{"id":"http://arxiv.org/abs/2410.05470v2","updated":"2025-03-02T02:07:21Z","published":"2024-10-07T20:04:29Z","title":"Image Watermarks are Removable Using Controllable Regeneration from\n Clean Noise","summary":" Image watermark techniques provide an effective way to assert ownership,\ndeter misuse, and trace content sources, which has become increasingly\nessential in the era of large generative models. A critical attribute of\nwatermark techniques is their robustness against various manipulations. In this\npaper, we introduce a watermark removal approach capable of effectively\nnullifying state-of-the-art watermarking techniques. Our primary insight\ninvolves regenerating the watermarked image starting from a clean Gaussian\nnoise via a controllable diffusion model, utilizing the extracted semantic and\nspatial features from the watermarked image. The semantic control adapter and\nthe spatial control network are specifically trained to control the denoising\nprocess towards ensuring image quality and enhancing consistency between the\ncleaned image and the original watermarked image. To achieve a smooth trade-off\nbetween watermark removal performance and image consistency, we further propose\nan adjustable and controllable regeneration scheme. 
This scheme adds varying\nnumbers of noise steps to the latent representation of the watermarked image,\nfollowed by a controlled denoising process starting from this noisy latent\nrepresentation. As the number of noise steps increases, the latent\nrepresentation progressively approaches clean Gaussian noise, facilitating the\ndesired trade-off. We apply our watermark removal methods across various\nwatermarking techniques, and the results demonstrate that our methods offer\nsuperior visual consistency/quality and enhanced watermark removal performance\ncompared to existing regeneration approaches. Our code is available at\nhttps://github.com/yepengliu/CtrlRegen.\n","authors":["Yepeng Liu","Yiren Song","Hai Ci","Yu Zhang","Haofan Wang","Mike Zheng Shou","Yuheng Bu"],"pdf_url":"https://arxiv.org/pdf/2410.05470v2.pdf","comment":"ICLR2025"},{"id":"http://arxiv.org/abs/2406.19653v3","updated":"2025-03-02T01:47:44Z","published":"2024-06-28T04:48:05Z","title":"ACES: Automatic Cohort Extraction System for Event-Stream Datasets","summary":" Reproducibility remains a significant challenge in machine learning (ML) for\nhealthcare. Datasets, model pipelines, and even task or cohort definitions are\noften private in this field, leading to a significant barrier in sharing,\niterating, and understanding ML results on electronic health record (EHR)\ndatasets. We address a significant part of this problem by introducing the\nAutomatic Cohort Extraction System (ACES) for event-stream data. This library\nis designed to simultaneously simplify the development of tasks and cohorts for\nML in healthcare and also enable their reproduction, both at an exact level for\nsingle datasets and at a conceptual level across datasets. 
To accomplish this,\nACES provides: (1) a highly intuitive and expressive domain-specific\nconfiguration language for defining both dataset-specific concepts and\ndataset-agnostic inclusion or exclusion criteria, and (2) a pipeline to\nautomatically extract patient records that meet these defined criteria from\nreal-world data. ACES can be automatically applied to any dataset in either the\nMedical Event Data Standard (MEDS) or Event Stream GPT (ESGPT) formats, or to\n*any* dataset in which the necessary task-specific predicates can be extracted\nin an event-stream form. ACES has the potential to significantly lower the\nbarrier to entry for defining ML tasks in representation learning, redefine the\nway researchers interact with EHR datasets, and significantly improve the state\nof reproducibility for ML studies using this modality. ACES is available at:\nhttps://github.com/justin13601/aces.\n","authors":["Justin Xu","Jack Gallifant","Alistair E. W. Johnson","Matthew B. A. McDermott"],"pdf_url":"https://arxiv.org/pdf/2406.19653v3.pdf","comment":"[ICLR 2025] For the latest ACES online documentation, please see\n https://eventstreamaces.readthedocs.io/en/latest/"},{"id":"http://arxiv.org/abs/2410.21533v2","updated":"2025-03-02T04:39:42Z","published":"2024-10-28T21:02:13Z","title":"L3Ms - Lagrange Large Language Models","summary":" Supervised fine-tuning (SFT) and alignment of large language models (LLMs)\nare key steps in providing a good user experience. However, the concept of an\nappropriate alignment is inherently application-dependent, and current methods\noften rely on heuristic choices to drive optimization. In this work, we\nformulate SFT and alignment as a constrained optimization problem: the LLM is\nfine-tuned on a task while being required to meet application-specific\nrequirements, without resorting to heuristics. To solve this, we propose\nLagrange Large Language Models (L3Ms), which employ logarithmic barriers to\nenforce the constraints. 
This approach allows for the customization of L3Ms\nacross diverse applications while avoiding heuristic-driven processes. We\nexperimentally demonstrate the versatility and efficacy of L3Ms in achieving\ntailored alignments for various applications.\n","authors":["Guneet S. Dhillon","Xingjian Shi","Yee Whye Teh","Alex Smola"],"pdf_url":"https://arxiv.org/pdf/2410.21533v2.pdf","comment":"International Conference on Learning Representations (ICLR), 2025"}],"Genomics":[{"id":"http://arxiv.org/abs/2309.13326v3","updated":"2025-03-02T10:25:54Z","published":"2023-09-23T10:10:00Z","title":"SARS-CoV-2 Wastewater Genomic Surveillance: Approaches, Challenges, and\n Opportunities","summary":" During the SARS-CoV-2 pandemic, wastewater-based genomic surveillance (WWGS)\nemerged as an efficient viral surveillance tool that takes into account\nasymptomatic cases and can identify known and novel mutations and offers the\nopportunity to assign known virus lineages based on the detected mutations\nprofiles. WWGS can also hint towards novel or cryptic lineages, but it is\ndifficult to clearly identify and define novel lineages from wastewater (WW)\nalone. While WWGS has significant advantages in monitoring SARS-CoV-2 viral\nspread, technical challenges remain, including poor sequencing coverage and\nquality due to viral RNA degradation. As a result, the viral RNAs in wastewater\nhave low concentrations and are often fragmented, making sequencing difficult.\nWWGS analysis requires advanced computational tools that are yet to be\ndeveloped and benchmarked. The existing bioinformatics tools used to analyze\nwastewater sequencing data are often based on previously developed methods for\nquantifying the expression of transcripts or viral diversity. Those methods\nwere not developed for wastewater sequencing data specifically, and are not\noptimized to address unique challenges associated with wastewater. 
While\nspecialized tools for analysis of wastewater sequencing data have also been\ndeveloped recently, it remains to be seen how they will perform given the\nongoing evolution of SARS-CoV-2 and the decline in testing and patient-based\ngenomic surveillance. Here, we discuss opportunities and challenges associated\nwith WWGS, including sample preparation, sequencing technology, and\nbioinformatics methods.\n","authors":["Viorel Munteanu","Michael A. Saldana","David Dreifuss","Wenhao O. Ouyang","Jannatul Ferdous","Fatemeh Mohebbi","Jessica Schlueter","Dumitru Ciorba","Viorel Bostan","Victor Gordeev","Justin Maine Su","Nadiia Kasianchuk","Nitesh Kumar Sharma","Sergey Knyazev","Eva Aßmann","Andrei Lobiuc","Mihai Covasa","Keith A. Crandall","Nicholas C. Wu","Christopher E. Mason","Braden T Tierney","Alexander G Lucaci","Roel A. Ophoff","Cynthia Gibas","Piotr Rzymski","Pavel Skums","Helena Solo-Gabriele","Beerenwinkel Niko","Alex Zelikovsky","Martin Hölzer","Adam Smith","Serghei Mangul"],"pdf_url":"https://arxiv.org/pdf/2309.13326v3.pdf","comment":"V Munteanu and M Saldana contributed equally to this work. M\n H\\\"olzer, A Smith and S Mangul jointly supervised this work. For\n correspondence: serghei.mangul@gmail.com"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.15998v2","updated":"2025-03-02T23:41:37Z","published":"2024-08-28T17:59:31Z","title":"Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of\n Encoders","summary":" The ability to accurately interpret complex visual information is a crucial\ntopic of multimodal large language models (MLLMs). Recent work indicates that\nenhanced visual perception significantly reduces hallucinations and improves\nperformance on resolution-sensitive tasks, such as optical character\nrecognition and document analysis. A number of recent MLLMs achieve this goal\nusing a mixture of vision encoders. 
Despite their success, there is a lack of\nsystematic comparisons and detailed ablation studies addressing critical\naspects, such as expert selection and the integration of multiple vision\nexperts. This study provides an extensive exploration of the design space for\nMLLMs using a mixture of vision encoders and resolutions. Our findings reveal\nseveral underlying principles common to various existing strategies, leading to\na streamlined yet effective design approach. We discover that simply\nconcatenating visual tokens from a set of complementary vision encoders is as\neffective as more complex mixing architectures or strategies. We additionally\nintroduce Pre-Alignment to bridge the gap between vision-focused encoders and\nlanguage tokens, enhancing model coherence. The resulting family of MLLMs,\nEagle, surpasses other leading open-source models on major MLLM benchmarks.\n","authors":["Min Shi","Fuxiao Liu","Shihao Wang","Shijia Liao","Subhashree Radhakrishnan","Yilin Zhao","De-An Huang","Hongxu Yin","Karan Sapra","Yaser Yacoob","Humphrey Shi","Bryan Catanzaro","Andrew Tao","Jan Kautz","Zhiding Yu","Guilin Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15998v2.pdf","comment":"Github: https://github.com/NVlabs/Eagle, HuggingFace:\n https://huggingface.co/NVEagle"},{"id":"http://arxiv.org/abs/2411.09851v3","updated":"2025-03-02T23:29:50Z","published":"2024-11-15T00:09:37Z","title":"SymbolFit: Automatic Parametric Modeling with Symbolic Regression","summary":" We introduce SymbolFit, a framework that automates parametric modeling by\nusing symbolic regression to perform a machine-search for functions that fit\nthe data while simultaneously providing uncertainty estimates in a single run.\nTraditionally, constructing a parametric model to accurately describe binned\ndata has been a manual and iterative process, requiring an adequate functional\nform to be determined before the fit can be performed. 
The main challenge\narises when the appropriate functional forms cannot be derived from first\nprinciples, especially when there is no underlying true closed-form function\nfor the distribution. In this work, we develop a framework that automates and\nstreamlines the process by utilizing symbolic regression, a machine learning\ntechnique that explores a vast space of candidate functions without requiring a\npredefined functional form because the functional form itself is treated as a\ntrainable parameter, making the process far more efficient and effortless than\ntraditional regression methods. We demonstrate the framework in high-energy\nphysics experiments at the CERN Large Hadron Collider (LHC) using five real\nproton-proton collision datasets from new physics searches, including\nbackground modeling in resonance searches for high-mass dijet, trijet,\npaired-dijet, diphoton, and dimuon events. We show that our framework can\nflexibly and efficiently generate a wide range of candidate functions that fit\na nontrivial distribution well using a simple fit configuration that varies\nonly by random seed, and that the same fit configuration, which defines a vast\nfunction space, can also be applied to distributions of different shapes,\nwhereas achieving a comparable result with traditional methods would have\nrequired extensive manual effort.\n","authors":["Ho Fung Tsoi","Dylan Rankin","Cecile Caillol","Miles Cranmer","Sridhara Dasu","Javier Duarte","Philip Harris","Elliot Lipeles","Vladimir Loncar"],"pdf_url":"https://arxiv.org/pdf/2411.09851v3.pdf","comment":"50 pages, 35 figures. Under review. 
The API can be used\n out-of-the-box and is available at https://github.com/hftsoi/symbolfit"},{"id":"http://arxiv.org/abs/2401.17116v2","updated":"2025-03-02T23:04:57Z","published":"2024-01-30T15:50:06Z","title":"Quantum time dynamics mediated by the Yang-Baxter equation and\n artificial neural networks","summary":" Quantum computing shows great potential, but errors pose a significant\nchallenge. This study explores new strategies for mitigating quantum errors\nusing artificial neural networks (ANN) and the Yang-Baxter equation (YBE).\nUnlike traditional error mitigation methods, which are computationally\nintensive, we investigate artificial error mitigation. We developed a novel\nmethod that combines ANN for noise mitigation combined with the YBE to generate\nnoisy data. This approach effectively reduces noise in quantum simulations,\nenhancing the accuracy of the results. The YBE rigorously preserves quantum\ncorrelations and symmetries in spin chain simulations in certain classes of\nintegrable lattice models, enabling effective compression of quantum circuits\nwhile retaining linear scalability with the number of qubits. This compression\nfacilitates both full and partial implementations, allowing the generation of\nnoisy quantum data on hardware alongside noiseless simulations using classical\nplatforms. By introducing controlled noise through the YBE, we enhance the\ndataset for error mitigation. We train an ANN model on partial data from\nquantum simulations, demonstrating its effectiveness in mitigating errors in\ntime-evolving quantum states, providing a scalable framework to enhance quantum\ncomputation fidelity, particularly in noisy intermediate-scale quantum (NISQ)\nsystems. We demonstrate the efficacy of this approach by performing quantum\ntime dynamics simulations using the Heisenberg XY Hamiltonian on real quantum\ndevices.\n","authors":["Sahil Gulania","Yuri Alexeev","Stephen K. 
Gray","Bo Peng","Niranjan Govind"],"pdf_url":"https://arxiv.org/pdf/2401.17116v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13496v2","updated":"2025-03-02T22:34:01Z","published":"2024-02-21T03:14:45Z","title":"Heterogeneous Graph Neural Network on Semantic Tree","summary":" The recent past has seen an increasing interest in Heterogeneous Graph Neural\nNetworks (HGNNs), since many real-world graphs are heterogeneous in nature,\nfrom citation graphs to email graphs. However, existing methods ignore a tree\nhierarchy among metapaths, naturally constituted by different node types and\nrelation types. In this paper, we present HetTree, a novel HGNN that models\nboth the graph structure and heterogeneous aspects in a scalable and effective\nmanner. Specifically, HetTree builds a semantic tree data structure to capture\nthe hierarchy among metapaths. To effectively encode the semantic tree, HetTree\nuses a novel subtree attention mechanism to emphasize metapaths that are more\nhelpful in encoding parent-child relationships. Moreover, HetTree proposes\ncarefully matching pre-computed features and labels correspondingly,\nconstituting a complete metapath representation. Our evaluation of HetTree on a\nvariety of real-world datasets demonstrates that it outperforms all existing\nbaselines on open benchmarks and efficiently scales to large real-world graphs\nwith millions of nodes and edges.\n","authors":["Mingyu Guan","Jack W. Stokes","Qinlong Luo","Fuchen Liu","Purvanshi Mehta","Elnaz Nouri","Taesoo Kim"],"pdf_url":"https://arxiv.org/pdf/2402.13496v2.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2407.11249v3","updated":"2025-03-02T22:12:01Z","published":"2024-07-15T21:32:58Z","title":"Disentangling Representations through Multi-task Learning","summary":" Intelligent perception and interaction with the world hinges on internal\nrepresentations that capture its underlying structure (''disentangled'' or\n''abstract'' representations). 
Disentangled representations serve as world\nmodels, isolating latent factors of variation in the world along approximately\northogonal directions, thus facilitating feature-based generalization. We\nprovide experimental and theoretical results guaranteeing the emergence of\ndisentangled representations in agents that optimally solve multi-task evidence\naccumulation classification tasks, canonical in the neuroscience literature.\nThe key conceptual finding is that, by producing accurate multi-task\nclassification estimates, a system implicitly represents a set of coordinates\nspecifying a disentangled representation of the underlying latent state of the\ndata it receives. The theory provides conditions for the emergence of these\nrepresentations in terms of noise, number of tasks, and evidence accumulation\ntime. We experimentally validate these predictions in RNNs trained to\nmulti-task, which learn disentangled representations in the form of continuous\nattractors, leading to zero-shot out-of-distribution (OOD) generalization in\npredicting latent factors. We demonstrate the robustness of our framework\nacross autoregressive architectures, decision boundary geometries and in tasks\nrequiring classification confidence estimation. We find that transformers are\nparticularly suited for disentangling representations, which might explain\ntheir unique world understanding abilities. 
Overall, our framework establishes\na formal link between competence at multiple tasks and the formation of\ndisentangled, interpretable world models in both biological and artificial\nsystems, and helps explain why ANNs often arrive at human-interpretable\nconcepts, and how they both may acquire exceptional zero-shot generalization\ncapabilities.\n","authors":["Pantelis Vafidis","Aman Bhargava","Antonio Rangel"],"pdf_url":"https://arxiv.org/pdf/2407.11249v3.pdf","comment":"43 pages, 17 figures"},{"id":"http://arxiv.org/abs/2406.10279v3","updated":"2025-03-02T21:03:52Z","published":"2024-06-12T03:29:06Z","title":"We Have a Package for You! A Comprehensive Analysis of Package\n Hallucinations by Code Generating LLMs","summary":" The reliance of popular programming languages such as Python and JavaScript\non centralized package repositories and open-source software, combined with the\nemergence of code-generating Large Language Models (LLMs), has created a new\ntype of threat to the software supply chain: package hallucinations. These\nhallucinations, which arise from fact-conflicting errors when generating code\nusing LLMs, represent a novel form of package confusion attack that poses a\ncritical threat to the integrity of the software supply chain. This paper\nconducts a rigorous and comprehensive evaluation of package hallucinations\nacross different programming languages, settings, and parameters, exploring how\na diverse set of models and configurations affect the likelihood of generating\nerroneous package recommendations and identifying the root causes of this\nphenomenon. Using 16 popular LLMs for code generation and two unique prompt\ndatasets, we generate 576,000 code samples in two programming languages that we\nanalyze for package hallucinations. 
Our findings reveal that that the average\npercentage of hallucinated packages is at least 5.2% for commercial models and\n21.7% for open-source models, including a staggering 205,474 unique examples of\nhallucinated package names, further underscoring the severity and pervasiveness\nof this threat. To overcome this problem, we implement several hallucination\nmitigation strategies and show that they are able to significantly reduce the\nnumber of package hallucinations while maintaining code quality. Our\nexperiments and findings highlight package hallucinations as a persistent and\nsystemic phenomenon while using state-of-the-art LLMs for code generation, and\na significant challenge which deserves the research community's urgent\nattention.\n","authors":["Joseph Spracklen","Raveen Wijewickrama","A H M Nazmus Sakib","Anindya Maiti","Bimal Viswanath","Murtuza Jadliwala"],"pdf_url":"https://arxiv.org/pdf/2406.10279v3.pdf","comment":"To appear in the 2025 USENIX Security Symposium. 22 pages, 14\n figures, 8 tables. Edited from original version for submission to a different\n conference. No change to original results or findings"},{"id":"http://arxiv.org/abs/2410.06232v3","updated":"2025-03-02T20:40:21Z","published":"2024-10-08T17:41:37Z","title":"Range, not Independence, Drives Modularity in Biologically Inspired\n Representations","summary":" Why do biological and artificial neurons sometimes modularise, each encoding\na single meaningful variable, and sometimes entangle their representation of\nmany variables? In this work, we develop a theory of when biologically inspired\nnetworks -- those that are nonnegative and energy efficient -- modularise their\nrepresentation of source variables (sources). We derive necessary and\nsufficient conditions on a sample of sources that determine whether the neurons\nin an optimal biologically-inspired linear autoencoder modularise. 
Our theory\napplies to any dataset, extending far beyond the case of statistical\nindependence studied in previous work. Rather we show that sources modularise\nif their support is ``sufficiently spread''. From this theory, we extract and\nvalidate predictions in a variety of empirical studies on how data distribution\naffects modularisation in nonlinear feedforward and recurrent neural networks\ntrained on supervised and unsupervised tasks. Furthermore, we apply these ideas\nto neuroscience data, showing that range independence can be used to understand\nthe mixing or modularising of spatial and reward information in entorhinal\nrecordings in seemingly conflicting experiments. Further, we use these results\nto suggest alternate origins of mixed-selectivity, beyond the predominant\ntheory of flexible nonlinear classification. In sum, our theory prescribes\nprecise conditions on when neural activities modularise, providing tools for\ninducing and elucidating modular representations in brains and machines.\n","authors":["Will Dorrell","Kyle Hsu","Luke Hollingsworth","Jin Hwa Lee","Jiajun Wu","Chelsea Finn","Peter E Latham","Tim EJ Behrens","James CR Whittington"],"pdf_url":"https://arxiv.org/pdf/2410.06232v3.pdf","comment":"47 pages, 17 figures. WD and KH contributed equally; LH and JHL\n contributed equally"},{"id":"http://arxiv.org/abs/2408.15905v2","updated":"2025-03-02T20:30:28Z","published":"2024-08-28T16:19:35Z","title":"MetaGFN: Exploring Distant Modes with Adapted Metadynamics for\n Continuous GFlowNets","summary":" Generative Flow Networks (GFlowNets) are a class of generative models that\nsample objects in proportion to a specified reward function through a learned\npolicy. They can be trained either on-policy or off-policy, needing a balance\nbetween exploration and exploitation for fast convergence to a target\ndistribution. 
While exploration strategies for discrete GFlowNets have been\nstudied, exploration in the continuous case remains to be investigated, despite\nthe potential for novel exploration algorithms due to the local connectedness\nof continuous domains. Here, we introduce Adapted Metadynamics, a variant of\nmetadynamics that can be applied to arbitrary black-box reward functions on\ncontinuous domains. We use Adapted Metadynamics as an exploration strategy for\ncontinuous GFlowNets. We show several continuous domains where the resulting\nalgorithm, MetaGFN, accelerates convergence to the target distribution and\ndiscovers more distant reward modes than previous off-policy exploration\nstrategies used for GFlowNets.\n","authors":["Dominic Phillips","Flaviu Cipcigan"],"pdf_url":"https://arxiv.org/pdf/2408.15905v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2502.12381v3","updated":"2025-03-02T20:17:56Z","published":"2025-02-17T23:40:27Z","title":"Linear Diffusion Networks","summary":" Diffusion kernels capture global dependencies. We present Linear Diffusion\nNetworks (LDNs), a novel architecture that reinterprets sequential data\nprocessing as a unified diffusion process. Our model integrates adaptive\ndiffusion modules with localized nonlinear updates and a diffusion-inspired\nattention mechanism. This design enables efficient global information\npropagation while preserving fine-grained temporal details. LDN overcomes the\nlimitations of conventional recurrent and transformer models by allowing full\nparallelization across time steps and supporting robust multi-scale temporal\nrepresentations. 
Experiments on benchmark sequence modeling tasks demonstrate\nthat LDN delivers competitive performance across ImageNet and GLUE tasks.\n","authors":["Jacob Fein-Ashley"],"pdf_url":"https://arxiv.org/pdf/2502.12381v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.05967v2","updated":"2025-03-02T20:16:43Z","published":"2025-02-09T17:31:09Z","title":"$μ$nit Scaling: Simple and Scalable FP8 LLM Training","summary":" Large Language Model training with 8-bit floating point (FP8) formats\npromises significant efficiency improvements, but reduced numerical precision\nmakes training challenging. It is currently possible to train in FP8 only if\none is willing to tune various hyperparameters, reduce model scale, or accept\nthe overhead of computing dynamic scale factors. We demonstrate simple,\nscalable FP8 training that requires no dynamic scaling factors or special\nhyperparameters, even at large model sizes. Our method, $\\mu$nit Scaling\n($\\mu$S), also enables simple hyperparameter transfer across model widths,\nmatched numerics across training and inference, and other desirable properties.\n$\\mu$nit Scaling is straightforward to implement, consisting of a set of\nminimal interventions based on a first-principles analysis of common\ntransformer operations. We validate our method by training models from 1B to\n13B parameters, performing all hidden linear layer computations in FP8. 
We\nachieve quality equal to higher precision baselines while also training up to\n33% faster.\n","authors":["Saaketh Narayan","Abhay Gupta","Mansheej Paul","Davis Blalock"],"pdf_url":"https://arxiv.org/pdf/2502.05967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12534v2","updated":"2025-03-02T20:13:11Z","published":"2024-04-18T22:54:08Z","title":"Lean Copilot: Large Language Models as Copilots for Theorem Proving in\n Lean","summary":" Neural theorem proving combines large language models (LLMs) with proof\nassistants such as Lean, where the correctness of formal proofs can be\nrigorously verified, leaving no room for hallucination. With existing neural\ntheorem provers pretrained on a fixed collection of data and offering valuable\nsuggestions at times, it is challenging for them to continually prove novel\ntheorems in a fully autonomous mode, where human insights may be critical. In\nthis paper, we explore LLMs as copilots that assist humans in proving theorems.\nWe introduce Lean Copilot, an general framework for running LLM inference\nnatively in Lean. It enables programmers to build various LLM-based proof\nautomation tools that integrate seamlessly into the workflow of Lean users.\nLean users can use our pretrained models or bring their own ones that run\neither locally (with or without GPUs) or on the cloud. Using Lean Copilot, we\nbuild LLM-based tools that suggest proof steps, complete proof goals, and\nselect relevant premises. Experimental results on the Mathematics in Lean\ntextbook demonstrate the effectiveness of our method compared to existing\nrule-based proof automation in Lean (aesop). When assisting humans, Lean\nCopilot requires only 2.08 manually-entered proof steps on average (3.86\nrequired by aesop); when automating the theorem proving process, Lean Copilot\nautomates 74.2% proof steps on average, 85% better than aesop (40.1%). 
We open\nsource all code and artifacts under a permissive MIT license to facilitate\nfurther research.\n","authors":["Peiyang Song","Kaiyu Yang","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2404.12534v2.pdf","comment":"All code and artifacts open-sourced at\n https://github.com/lean-dojo/LeanCopilot"},{"id":"http://arxiv.org/abs/2405.09660v3","updated":"2025-03-02T19:51:43Z","published":"2024-05-15T19:03:08Z","title":"Fast Two-Time-Scale Stochastic Gradient Method with Applications in\n Reinforcement Learning","summary":" Two-time-scale optimization is a framework introduced in Zeng et al. (2024)\nthat abstracts a range of policy evaluation and policy optimization problems in\nreinforcement learning (RL). Akin to bi-level optimization under a particular\ntype of stochastic oracle, the two-time-scale optimization framework has an\nupper level objective whose gradient evaluation depends on the solution of a\nlower level problem, which is to find the root of a strongly monotone operator.\nIn this work, we propose a new method for solving two-time-scale optimization\nthat achieves significantly faster convergence than the prior arts. The key\nidea of our approach is to leverage an averaging step to improve the estimates\nof the operators in both lower and upper levels before using them to update the\ndecision variables. These additional averaging steps eliminate the direct\ncoupling between the main variables, enabling the accelerated performance of\nour algorithm. We characterize the finite-time convergence rates of the\nproposed algorithm under various conditions of the underlying objective\nfunction, including strong convexity, Polyak-Lojasiewicz condition, and general\nnon-convexity. These rates significantly improve over the best-known complexity\nof the standard two-time-scale stochastic approximation algorithm. 
When applied\nto RL, we show how the proposed algorithm specializes to novel online\nsample-based methods that surpass or match the performance of the existing\nstate of the art. Finally, we support our theoretical results with numerical\nsimulations in RL.\n","authors":["Sihan Zeng","Thinh T. Doan"],"pdf_url":"https://arxiv.org/pdf/2405.09660v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16103v4","updated":"2025-03-02T18:38:37Z","published":"2024-10-21T15:31:06Z","title":"LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics","summary":" We introduce LDAdam, a memory-efficient optimizer for training large models,\nthat performs adaptive optimization steps within lower dimensional subspaces,\nwhile consistently exploring the full parameter space during training. This\nstrategy keeps the optimizer's memory footprint to a fraction of the model\nsize. LDAdam relies on a new projection-aware update rule for the optimizer\nstates that allows for transitioning between subspaces, i.e., estimation of the\nstatistics of the projected gradients. To mitigate the errors due to low-rank\nprojection, LDAdam integrates a new generalized error feedback mechanism, which\nexplicitly accounts for both gradient and optimizer state compression. We prove\nthe convergence of LDAdam under standard assumptions, and show that LDAdam\nallows for accurate and efficient fine-tuning and pre-training of language\nmodels. 
Code is available at https://github.com/IST-DASLab/LDAdam\n","authors":["Thomas Robert","Mher Safaryan","Ionut-Vlad Modoranu","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2410.16103v4.pdf","comment":"39 pages, ICLR 2025"},{"id":"http://arxiv.org/abs/2312.15289v3","updated":"2025-03-02T18:36:56Z","published":"2023-12-23T16:10:53Z","title":"Fréchet Wavelet Distance: A Domain-Agnostic Metric for Image\n Generation","summary":" Modern metrics for generative learning like Fr\\'echet Inception Distance\n(FID) and DINOv2-Fr\\'echet Distance (FD-DINOv2) demonstrate impressive\nperformance. However, they suffer from various shortcomings, like a bias\ntowards specific generators and datasets. To address this problem, we propose\nthe Fr\\'echet Wavelet Distance (FWD) as a domain-agnostic metric based on the\nWavelet Packet Transform ($W_p$). FWD provides a sight across a broad spectrum\nof frequencies in images with a high resolution, preserving both spatial and\ntextural aspects. Specifically, we use $W_p$ to project generated and real\nimages to the packet coefficient space. We then compute the Fr\\'echet distance\nwith the resultant coefficients to evaluate the quality of a generator. This\nmetric is general-purpose and dataset-domain agnostic, as it does not rely on\nany pre-trained network, while being more interpretable due to its ability to\ncompute Fr\\'echet distance per packet, enhancing transparency. 
We conclude with\nan extensive evaluation of a wide variety of generators across various datasets\nthat the proposed FWD can generalize and improve robustness to domain shifts\nand various corruptions compared to other metrics.\n","authors":["Lokesh Veeramacheneni","Moritz Wolter","Hildegard Kuehne","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2312.15289v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02950v2","updated":"2025-03-02T18:31:59Z","published":"2024-08-06T04:28:16Z","title":"Kolmogorov-Arnold PointNet: Deep learning for prediction of fluid fields\n on irregular geometries","summary":" Kolmogorov-Arnold Networks (KANs) have emerged as a promising alternative to\ntraditional Multilayer Perceptrons (MLPs) in deep learning. KANs have already\nbeen integrated into various architectures, such as convolutional neural\nnetworks, graph neural networks, and transformers, and their potential has been\nassessed for predicting physical quantities. However, the combination of KANs\nwith point-cloud-based neural networks (e.g., PointNet) for computational\nphysics has not yet been explored. To address this, we present\nKolmogorov-Arnold PointNet (KA-PointNet) as a novel supervised deep learning\nframework for the prediction of incompressible steady-state fluid flow fields\nin irregular domains, where the predicted fields are a function of the geometry\nof the domains. In KA-PointNet, we implement shared KANs in the segmentation\nbranch of the PointNet architecture. We utilize Jacobi polynomials to construct\nshared KANs. As a benchmark test case, we consider incompressible laminar\nsteady-state flow over a cylinder, where the geometry of its cross-section\nvaries over the data set. 
We investigate the performance of Jacobi polynomials\nwith different degrees as well as special cases of Jacobi polynomials such as\nLegendre polynomials, Chebyshev polynomials of the first and second kinds, and\nGegenbauer polynomials, in terms of the computational cost of training and\naccuracy of prediction of the test set. Additionally, we compare the\nperformance of PointNet with shared KANs (i.e., KA-PointNet) and PointNet with\nshared MLPs. It is observed that when the number of trainable parameters is\napproximately equal, PointNet with shared KANs (i.e., KA-PointNet) outperforms\nPointNet with shared MLPs. Moreover, KA-PointNet predicts the pressure and\nvelocity distributions along the surface of cylinders more accurately,\nresulting in more precise computations of lift and drag.\n","authors":["Ali Kashefi"],"pdf_url":"https://arxiv.org/pdf/2408.02950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14105v4","updated":"2025-03-02T18:24:29Z","published":"2024-05-23T02:14:17Z","title":"Distributed Speculative Inference (DSI): Speculation Parallelism for\n Provably Faster Lossless Language Model Inference","summary":" This paper introduces distributed speculative inference (DSI), a novel\ninference algorithm that is provably faster than speculative inference (SI)\n[leviathan2023, chen2023, miao2024, sun2025, timor2025] and standard\nautoregressive inference (non-SI). Like other SI algorithms, DSI operates on\nfrozen language models (LMs), requiring no training or architectural\nmodifications, and it preserves the target distribution. Prior studies on SI\nhave demonstrated empirical speedups over non-SI--but rely on sufficiently fast\nand accurate drafters, which are often unavailable in practice. We identify a\ngap where SI can be slower than non-SI if drafters are too slow or inaccurate.\nWe close this gap by proving that DSI is faster than both SI and non-SI--given\nany drafters. 
DSI is therefore not only faster than SI, but also unlocks the\nacceleration of LMs for which SI fails. DSI leverages speculation parallelism\n(SP), a novel type of task parallelism, to orchestrate target and drafter\ninstances that overlap in time, establishing a new foundational tradeoff\nbetween computational resources and latency. Our simulations show that DSI is\n1.29-1.92x faster than SI in single-node setups for various off-the-shelf LMs\nand tasks. We open-source all our code.\n","authors":["Nadav Timor","Jonathan Mamou","Daniel Korat","Moshe Berchansky","Oren Pereg","Moshe Wasserblat","Tomer Galanti","Michal Gordon","David Harel"],"pdf_url":"https://arxiv.org/pdf/2405.14105v4.pdf","comment":"Published at ICLR 2025. (Link:\n https://openreview.net/forum?id=cJd1BgZ9CS)"},{"id":"http://arxiv.org/abs/2411.10509v2","updated":"2025-03-02T18:17:14Z","published":"2024-11-15T15:39:04Z","title":"TESGNN: Temporal Equivariant Scene Graph Neural Networks for Efficient\n and Robust Multi-View 3D Scene Understanding","summary":" Scene graphs have proven to be highly effective for various scene\nunderstanding tasks due to their compact and explicit representation of\nrelational information. However, current methods often overlook the critical\nimportance of preserving symmetry when generating scene graphs from 3D point\nclouds, which can lead to reduced accuracy and robustness, particularly when\ndealing with noisy, multi-view data. Furthermore, a major limitation of prior\napproaches is the lack of temporal modeling to capture time-dependent\nrelationships among dynamically evolving entities in a scene. 
To address these\nchallenges, we propose Temporal Equivariant Scene Graph Neural Network\n(TESGNN), consisting of two key components: (1) an Equivariant Scene Graph\nNeural Network (ESGNN), which extracts information from 3D point clouds to\ngenerate scene graph while preserving crucial symmetry properties, and (2) a\nTemporal Graph Matching Network, which fuses scene graphs generated by ESGNN\nacross multiple time sequences into a unified global representation using an\napproximate graph-matching algorithm. Our combined architecture TESGNN\noutperforms current state-of-the-art methods in scene graph generation,\nachieving higher accuracy and faster training convergence. Moreover, we show\nthat leveraging the symmetry-preserving property produces a more stable and\naccurate global scene representation compared to existing approaches. Last but\nnot least, it is computationally efficient and easily implementable using\nexisting frameworks, making it well-suited for real-time applications in\nrobotics and computer vision. This approach paves the way for more robust and\nscalable solutions to complex multi-view scene understanding challenges. Our\nsource code is publicly available at: https://github.com/HySonLab/TESGraph\n","authors":["Quang P. M. Pham","Khoi T. N. Nguyen","Lan C. Ngo","Truong Do","Dezhen Song","Truong-Son Hy"],"pdf_url":"https://arxiv.org/pdf/2411.10509v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.00609"},{"id":"http://arxiv.org/abs/2407.13929v2","updated":"2025-03-02T18:17:11Z","published":"2024-07-18T22:33:52Z","title":"Unmasking Social Bots: How Confident Are We?","summary":" Social bots remain a major vector for spreading disinformation on social\nmedia and a menace to the public. 
Despite the progress made in developing\nmultiple sophisticated social bot detection algorithms and tools, bot detection\nremains a challenging, unsolved problem that is fraught with uncertainty due to\nthe heterogeneity of bot behaviors, training data, and detection algorithms.\nDetection models often disagree on whether to label the same account as bot or\nhuman-controlled. However, they do not provide any measure of uncertainty to\nindicate how much we should trust their results. We propose to address both bot\ndetection and the quantification of uncertainty at the account level - a novel\nfeature of this research. This dual focus is crucial as it allows us to\nleverage additional information related to the quantified uncertainty of each\nprediction, thereby enhancing decision-making and improving the reliability of\nbot classifications. Specifically, our approach facilitates targeted\ninterventions for bots when predictions are made with high confidence and\nsuggests caution (e.g., gathering more data) when predictions are uncertain.\n","authors":["James Giroux","Ariyarathne Gangani","Alexander C. Nwala","Cristiano Fanelli"],"pdf_url":"https://arxiv.org/pdf/2407.13929v2.pdf","comment":"15 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2410.04642v3","updated":"2025-03-02T18:16:48Z","published":"2024-10-06T22:30:14Z","title":"The Optimization Landscape of SGD Across the Feature Learning Strength","summary":" We consider neural networks (NNs) where the final layer is down-scaled by a\nfixed hyperparameter $\\gamma$. Recent work has identified $\\gamma$ as\ncontrolling the strength of feature learning. As $\\gamma$ increases, network\nevolution changes from \"lazy\" kernel dynamics to \"rich\" feature-learning\ndynamics, with a host of associated benefits including improved performance on\ncommon tasks. 
In this work, we conduct a thorough empirical investigation of\nthe effect of scaling $\\gamma$ across a variety of models and datasets in the\nonline training setting. We first examine the interaction of $\\gamma$ with the\nlearning rate $\\eta$, identifying several scaling regimes in the\n$\\gamma$-$\\eta$ plane which we explain theoretically using a simple model. We\nfind that the optimal learning rate $\\eta^*$ scales non-trivially with\n$\\gamma$. In particular, $\\eta^* \\propto \\gamma^2$ when $\\gamma \\ll 1$ and\n$\\eta^* \\propto \\gamma^{2/L}$ when $\\gamma \\gg 1$ for a feed-forward network of\ndepth $L$. Using this optimal learning rate scaling, we proceed with an\nempirical study of the under-explored \"ultra-rich\" $\\gamma \\gg 1$ regime. We\nfind that networks in this regime display characteristic loss curves, starting\nwith a long plateau followed by a drop-off, sometimes followed by one or more\nadditional staircase steps. We find networks of different large $\\gamma$ values\noptimize along similar trajectories up to a reparameterization of time. We\nfurther find that optimal online performance is often found at large $\\gamma$\nand could be missed if this hyperparameter is not tuned. Our findings indicate\nthat analytical study of the large-$\\gamma$ limit may yield useful insights\ninto the dynamics of representation learning in performant models.\n","authors":["Alexander Atanasov","Alexandru Meterez","James B. 
Simon","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2410.04642v3.pdf","comment":"ICLR 2025 Final Copy, 40 Pages, 45 figures"},{"id":"http://arxiv.org/abs/2408.08531v2","updated":"2025-03-02T18:15:48Z","published":"2024-08-16T04:57:54Z","title":"Detecting Unsuccessful Students in Cybersecurity Exercises in Two\n Different Learning Environments","summary":" This full paper in the research track evaluates the usage of data logged from\ncybersecurity exercises in order to predict students who are potentially at\nrisk of performing poorly. Hands-on exercises are essential for learning since\nthey enable students to practice their skills. In cybersecurity, hands-on\nexercises are often complex and require knowledge of many topics. Therefore,\nstudents may miss solutions due to gaps in their knowledge and become\nfrustrated, which impedes their learning. Targeted aid by the instructor helps,\nbut since the instructor's time is limited, efficient ways to detect struggling\nstudents are needed. This paper develops automated tools to predict when a\nstudent is having difficulty. We formed a dataset with the actions of 313\nstudents from two countries and two learning environments: KYPO CRP and\nEDURange. These data are used in machine learning algorithms to predict the\nsuccess of students in exercises deployed in these environments. After\nextracting features from the data, we trained and cross-validated eight\nclassifiers for predicting the exercise outcome and evaluated their predictive\npower. The contribution of this paper is comparing two approaches to feature\nengineering, modeling, and classification performance on data from two learning\nenvironments. Using the features from either learning environment, we were able\nto detect and distinguish between successful and struggling students. A\ndecision tree classifier achieved the highest balanced accuracy and sensitivity\nwith data from both learning environments. 
The results show that activity data\nfrom cybersecurity exercises are suitable for predicting student success. In a\npotential application, such models can aid instructors in detecting struggling\nstudents and providing targeted help. We publish data and code for building\nthese models so that others can adopt or adapt them.\n","authors":["Valdemar Švábenský","Kristián Tkáčik","Aubrey Birdwell","Richard Weiss","Ryan S. Baker","Pavel Čeleda","Jan Vykopal","Jens Mache","Ankur Chattopadhyay"],"pdf_url":"https://arxiv.org/pdf/2408.08531v2.pdf","comment":"Published in the FIE 2024 conference proceedings, see\n https://doi.org/10.1109/FIE61694.2024.10893135"},{"id":"http://arxiv.org/abs/2410.11112v5","updated":"2025-03-02T17:48:06Z","published":"2024-10-14T21:43:48Z","title":"Differentiable Weightless Neural Networks","summary":" We introduce the Differentiable Weightless Neural Network (DWN), a model\nbased on interconnected lookup tables. Training of DWNs is enabled by a novel\nExtended Finite Difference technique for approximate differentiation of binary\nvalues. We propose Learnable Mapping, Learnable Reduction, and Spectral\nRegularization to further improve the accuracy and efficiency of these models.\nWe evaluate DWNs in three edge computing contexts: (1) an FPGA-based hardware\naccelerator, where they demonstrate superior latency, throughput, energy\nefficiency, and model area compared to state-of-the-art solutions, (2) a\nlow-power microcontroller, where they achieve preferable accuracy to XGBoost\nwhile subject to stringent memory constraints, and (3) ultra-low-cost chips,\nwhere they consistently outperform small models in both accuracy and projected\nhardware area. DWNs also compare favorably against leading approaches for\ntabular datasets, with higher average rank. Overall, our work positions DWNs as\na pioneering solution for edge-compatible high-throughput neural networks.\n","authors":["Alan T. L. 
Bacellar","Zachary Susskind","Mauricio Breternitz Jr.","Eugene John","Lizy K. John","Priscila M. V. Lima","Felipe M. G. França"],"pdf_url":"https://arxiv.org/pdf/2410.11112v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02372v2","updated":"2025-03-02T17:34:53Z","published":"2024-11-04T18:40:46Z","title":"Learning General-Purpose Biomedical Volume Representations using\n Randomized Synthesis","summary":" Current volumetric biomedical foundation models struggle to generalize as\npublic 3D datasets are small and do not cover the broad diversity of medical\nprocedures, conditions, anatomical regions, and imaging protocols. We address\nthis by creating a representation learning method that instead anticipates\nstrong domain shifts at training time itself. We first propose a data engine\nthat synthesizes highly variable training samples that would enable\ngeneralization to new biomedical contexts. To then train a single 3D network\nfor any voxel-level task, we develop a contrastive learning method that\npretrains the network to be stable against nuisance imaging variation simulated\nby the data engine, a key inductive bias for generalization. This network's\nfeatures can be used as robust representations of input images for downstream\ntasks and its weights provide a strong, dataset-agnostic initialization for\nfinetuning on new datasets. As a result, we set new standards across both\nmultimodality registration and few-shot segmentation, a first for any 3D\nbiomedical vision model, all without (pre-)training on any existing dataset of\nreal images.\n","authors":["Neel Dey","Benjamin Billot","Hallee E. Wong","Clinton J. Wang","Mengwei Ren","P. Ellen Grant","Adrian V. Dalca","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2411.02372v2.pdf","comment":"ICLR 2025: International Conference on Learning Representations. 
Code\n and model weights available at https://github.com/neel-dey/anatomix.\n Keywords: synthetic data, representation learning, medical image analysis,\n image registration, image segmentation"},{"id":"http://arxiv.org/abs/2403.08743v2","updated":"2025-03-02T17:33:03Z","published":"2024-03-13T17:46:28Z","title":"Prompting Fairness: Integrating Causality to Debias Large Language\n Models","summary":" Large language models (LLMs), despite their remarkable capabilities, are\nsusceptible to generating biased and discriminatory responses. As LLMs\nincreasingly influence high-stakes decision-making (e.g., hiring and\nhealthcare), mitigating these biases becomes critical. In this work, we propose\na causality-guided debiasing framework to tackle social biases, aiming to\nreduce the objectionable dependence between LLMs' decisions and the social\ninformation in the input. Our framework introduces a novel perspective to\nidentify how social information can affect an LLM's decision through different\ncausal pathways. Leveraging these causal insights, we outline principled\nprompting strategies that regulate these pathways through selection mechanisms.\nThis framework not only unifies existing prompting-based debiasing techniques,\nbut also opens up new directions for reducing bias by encouraging the model to\nprioritize fact-based reasoning over reliance on biased social cues. We\nvalidate our framework through extensive experiments on real-world datasets\nacross multiple domains, demonstrating its effectiveness in debiasing LLM\ndecisions, even with only black-box access to the model.\n","authors":["Jingling Li","Zeyu Tang","Xiaoyu Liu","Peter Spirtes","Kun Zhang","Liu Leqi","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.08743v2.pdf","comment":"24 pages, 10 figures"},{"id":"http://arxiv.org/abs/2502.03391v3","updated":"2025-03-02T17:32:48Z","published":"2025-02-05T17:29:12Z","title":"Explain Yourself, Briefly! 
Self-Explaining Neural Networks with Concise\n Sufficient Reasons","summary":" *Minimal sufficient reasons* represent a prevalent form of explanation - the\nsmallest subset of input features which, when held constant at their\ncorresponding values, ensure that the prediction remains unchanged. Previous\n*post-hoc* methods attempt to obtain such explanations but face two main\nlimitations: (1) Obtaining these subsets poses a computational challenge,\nleading most scalable methods to converge towards suboptimal, less meaningful\nsubsets; (2) These methods heavily rely on sampling out-of-distribution input\nassignments, potentially resulting in counterintuitive behaviors. To tackle\nthese limitations, we propose in this work a self-supervised training approach,\nwhich we term *sufficient subset training* (SST). Using SST, we train models to\ngenerate concise sufficient reasons for their predictions as an integral part\nof their output. Our results indicate that our framework produces succinct and\nfaithful subsets substantially more efficiently than competing post-hoc\nmethods, while maintaining comparable predictive performance.\n","authors":["Shahaf Bassan","Ron Eliav","Shlomit Gur"],"pdf_url":"https://arxiv.org/pdf/2502.03391v3.pdf","comment":"To appear in ICLR 2025"},{"id":"http://arxiv.org/abs/2410.06262v2","updated":"2025-03-02T17:20:26Z","published":"2024-10-08T18:02:29Z","title":"SymDiff: Equivariant Diffusion via Stochastic Symmetrisation","summary":" We propose SymDiff, a method for constructing equivariant diffusion models\nusing the framework of stochastic symmetrisation. SymDiff resembles a learned\ndata augmentation that is deployed at sampling time, and is lightweight,\ncomputationally efficient, and easy to implement on top of arbitrary\noff-the-shelf models. 
In contrast to previous work, SymDiff typically does not\nrequire any neural network components that are intrinsically equivariant,\navoiding the need for complex parameterisations or the use of higher-order\ngeometric features. Instead, our method can leverage highly scalable modern\narchitectures as drop-in replacements for these more constrained alternatives.\nWe show that this additional flexibility yields significant empirical benefit\nfor $\\mathrm{E}(3)$-equivariant molecular generation. To the best of our\nknowledge, this is the first application of symmetrisation to generative\nmodelling, suggesting its potential in this domain more generally.\n","authors":["Leo Zhang","Kianoosh Ashouritaklimi","Yee Whye Teh","Rob Cornish"],"pdf_url":"https://arxiv.org/pdf/2410.06262v2.pdf","comment":"Camera-ready version for ICLR 2025"},{"id":"http://arxiv.org/abs/2410.04810v2","updated":"2025-03-02T17:18:04Z","published":"2024-10-07T07:45:18Z","title":"FedBiP: Heterogeneous One-Shot Federated Learning with Personalized\n Latent Diffusion Models","summary":" One-Shot Federated Learning (OSFL), a special decentralized machine learning\nparadigm, has recently gained significant attention. OSFL requires only a\nsingle round of client data or model upload, which reduces communication costs\nand mitigates privacy threats compared to traditional FL. Despite these\npromising prospects, existing methods face challenges due to client data\nheterogeneity and limited data quantity when applied to real-world OSFL\nsystems. Recently, Latent Diffusion Models (LDM) have shown remarkable\nadvancements in synthesizing high-quality images through pretraining on\nlarge-scale datasets, thereby presenting a potential solution to overcome these\nissues. However, directly applying pretrained LDM to heterogeneous OSFL results\nin significant distribution shifts in synthetic data, leading to performance\ndegradation in classification models trained on such data. 
This issue is\nparticularly pronounced in rare domains, such as medical imaging, which are\nunderrepresented in LDM's pretraining data. To address this challenge, we\npropose Federated Bi-Level Personalization (FedBiP), which personalizes the\npretrained LDM at both instance-level and concept-level. Hereby, FedBiP\nsynthesizes images following the client's local data distribution without\ncompromising the privacy regulations. FedBiP is also the first approach to\nsimultaneously address feature space heterogeneity and client data scarcity in\nOSFL. Our method is validated through extensive experiments on three OSFL\nbenchmarks with feature space heterogeneity, as well as on challenging medical\nand satellite image datasets with label heterogeneity. The results demonstrate\nthe effectiveness of FedBiP, which substantially outperforms other OSFL\nmethods.\n","authors":["Haokun Chen","Hang Li","Yao Zhang","Jinhe Bi","Gengyuan Zhang","Yueqi Zhang","Philip Torr","Jindong Gu","Denis Krompass","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2410.04810v2.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2410.03011v2","updated":"2025-03-02T17:17:22Z","published":"2024-10-03T21:42:21Z","title":"Towards Understanding the Universality of Transformers for Next-Token\n Prediction","summary":" Causal Transformers are trained to predict the next token for a given\ncontext. While it is widely accepted that self-attention is crucial for\nencoding the causal structure of sequences, the precise underlying mechanism\nbehind this in-context autoregressive learning ability remains unclear. In this\npaper, we take a step towards understanding this phenomenon by studying the\napproximation ability of Transformers for next-token prediction. 
Specifically,\nwe explore the capacity of causal Transformers to predict the next token\n$x_{t+1}$ given an autoregressive sequence $(x_1, \\dots, x_t)$ as a prompt,\nwhere $ x_{t+1} = f(x_t) $, and $ f $ is a context-dependent function that\nvaries with each sequence. On the theoretical side, we focus on specific\ninstances, namely when $ f $ is linear or when $ (x_t)_{t \\geq 1} $ is\nperiodic. We explicitly construct a Transformer (with linear, exponential, or\nsoftmax attention) that learns the mapping $f$ in-context through a causal\nkernel descent method. The causal kernel descent method we propose provably\nestimates $x_{t+1} $ based solely on past and current observations $ (x_1,\n\\dots, x_t) $, with connections to the Kaczmarz algorithm in Hilbert spaces. We\npresent experimental results that validate our theoretical findings and suggest\ntheir applicability to more general mappings $f$.\n","authors":["Michael E. Sander","Gabriel Peyré"],"pdf_url":"https://arxiv.org/pdf/2410.03011v2.pdf","comment":"ICLR 2025, 20 pages"},{"id":"http://arxiv.org/abs/2502.11882v3","updated":"2025-03-02T17:15:11Z","published":"2025-02-17T15:09:45Z","title":"Leveraging Dual Process Theory in Language Agent Framework for Real-time\n Simultaneous Human-AI Collaboration","summary":" Agents built on large language models (LLMs) have excelled in turn-by-turn\nhuman-AI collaboration but struggle with simultaneous tasks requiring real-time\ninteraction. Latency issues and the challenge of inferring variable human\nstrategies hinder their ability to make autonomous decisions without explicit\ninstructions. Through experiments with current independent System 1 and System\n2 methods, we validate the necessity of using Dual Process Theory (DPT) in\nreal-time tasks. We propose DPT-Agent, a novel language agent framework that\nintegrates System 1 and System 2 for efficient real-time simultaneous human-AI\ncollaboration. 
DPT-Agent's System 1 uses a Finite-state Machine (FSM) and\ncode-as-policy for fast, intuitive, and controllable decision-making.\nDPT-Agent's System 2 integrates Theory of Mind (ToM) and asynchronous\nreflection to infer human intentions and perform reasoning-based autonomous\ndecisions. We demonstrate the effectiveness of DPT-Agent through further\nexperiments with rule-based agents and human collaborators, showing significant\nimprovements over mainstream LLM-based frameworks. DPT-Agent can effectively\nhelp LLMs convert correct slow thinking and reasoning into executable actions,\nthereby improving performance. To the best of our knowledge, DPT-Agent is the\nfirst language agent framework that achieves successful real-time simultaneous\nhuman-AI collaboration autonomously. Code of DPT-Agent can be found in\nhttps://github.com/sjtu-marl/DPT-Agent.\n","authors":["Shao Zhang","Xihuai Wang","Wenhao Zhang","Chaoran Li","Junru Song","Tingyu Li","Lin Qiu","Xuezhi Cao","Xunliang Cai","Wen Yao","Weinan Zhang","Xinbing Wang","Ying Wen"],"pdf_url":"https://arxiv.org/pdf/2502.11882v3.pdf","comment":"Preprint under review. Update the experimental results of the\n DeepSeek-R1 series models, o3-mini-high and o3-mini-medium"},{"id":"http://arxiv.org/abs/2403.18035v4","updated":"2025-03-02T16:41:49Z","published":"2024-03-26T18:40:36Z","title":"Bidirectional Consistency Models","summary":" Diffusion models (DMs) are capable of generating remarkably high-quality\nsamples by iteratively denoising a random vector, a process that corresponds to\nmoving along the probability flow ordinary differential equation (PF ODE).\nInterestingly, DMs can also invert an input image to noise by moving backward\nalong the PF ODE, a key operation for downstream tasks such as interpolation\nand image editing. However, the iterative nature of this process restricts its\nspeed, hindering its broader application. 
Recently, Consistency Models (CMs)\nhave emerged to address this challenge by approximating the integral of the PF\nODE, largely reducing the number of iterations. Yet, the absence of an explicit\nODE solver complicates the inversion process. To resolve this, we introduce\nBidirectional Consistency Model (BCM), which learns a single neural network\nthat enables both forward and backward traversal along the PF ODE, efficiently\nunifying generation and inversion tasks within one framework. We can train BCM\nfrom scratch or tune it using a pretrained consistency model, which reduces the\ntraining cost and increases scalability. We demonstrate that BCM enables\none-step generation and inversion while also allowing the use of additional\nsteps to enhance generation quality or reduce reconstruction error. We further\nshowcase BCM's capability in downstream tasks, such as interpolation and\ninpainting. Our code and weights are available at\nhttps://github.com/Mosasaur5526/BCM-iCT-torch.\n","authors":["Liangchen Li","Jiajun He"],"pdf_url":"https://arxiv.org/pdf/2403.18035v4.pdf","comment":"39 pages, 27 figures; a shorter version of this paper was acceppted\n at the ICML 2024 Workshop on Structured Probabilistic Inference & Generative\n Modeling"},{"id":"http://arxiv.org/abs/2412.07067v3","updated":"2025-03-02T16:40:03Z","published":"2024-12-10T00:19:28Z","title":"MoE-CAP: Benchmarking Cost, Accuracy and Performance of Sparse\n Mixture-of-Experts Systems","summary":" The Mixture-of-Experts (MoE) architecture is increasingly favored for scaling\nLarge Language Models (LLMs). Its key feature, sparse activation, selectively\nactivates only a subset of parameters (experts) per token, reducing memory\nbandwidth and compute FLOPs compared to dense models. To capitalize on this,\nMoE designers leverage heterogeneous compute and memory hardware to lower\nsystem costs. 
However, the interaction between model sparsity and hardware\nheterogeneity introduces trade-offs in Cost, Accuracy, and Performance (CAP).\nTo address this, we introduce MoE-CAP, a benchmarking method for evaluating\nsparse MoE systems across these three dimensions. Its key innovation is a\nsparsity-aware CAP analysis model, the first to integrate cost, performance,\nand accuracy metrics into a single diagram while estimating the impact of\nsparsity on system performance. MoE-CAP helps practitioners optimize hardware\nprovisioning for an MoE model-or vice versa. MoE-CAP supports various MoE\nmodels and provides more accurate metrics than existing methods.\n","authors":["Yao Fu","Yinsicheng Jiang","Yeqi Huang","Ping Nie","Zhan Lu","Leyang Xue","Congjie He","Man-Kit Sit","Jilong Xue","Li Dong","Ziming Miao","Kai Zou","Edoardo Ponti","Luo Mai"],"pdf_url":"https://arxiv.org/pdf/2412.07067v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16471v4","updated":"2025-03-02T16:36:29Z","published":"2023-08-31T05:26:14Z","title":"Foundational Policy Acquisition via Multitask Learning for Motor Skill\n Generation","summary":" In this study, we propose a multitask reinforcement learning algorithm for\nfoundational policy acquisition to generate novel motor skills.\n\\textcolor{\\hcolor}{Learning the rich representation of the multitask policy is\na challenge in dynamic movement generation tasks because the policy needs to\ncope with changes in goals or environments with different reward functions or\nphysical parameters. Inspired by human sensorimotor adaptation mechanisms, we\ndeveloped the learning pipeline to construct the encoder-decoder networks and\nnetwork selection to facilitate foundational policy acquisition under multiple\nsituations. First, we compared the proposed method with previous multitask\nreinforcement learning methods in the standard multi-locomotion tasks. 
The\nresults showed that the proposed approach outperformed the baseline methods.\nThen, we applied the proposed method to the ball heading task using a monopod\nrobot model to evaluate skill generation performance. The results showed that\nthe proposed method was able to adapt to novel target positions or\ninexperienced ball restitution coefficients but to acquire a foundational\npolicy network, originally learned for heading motion, which can generate an\nentirely new overhead kicking skill.\n","authors":["Satoshi Yamamori","Jun Morimoto"],"pdf_url":"https://arxiv.org/pdf/2308.16471v4.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2406.03735v2","updated":"2025-03-02T16:21:37Z","published":"2024-06-06T04:19:55Z","title":"Phase-Amplitude Reduction-Based Imitation Learning","summary":" In this study, we propose the use of the phase-amplitude reduction method to\nconstruct an imitation learning framework. Imitating human movement\ntrajectories is recognized as a promising strategy for generating a range of\nhuman-like robot movements. Unlike previous dynamical system-based imitation\nlearning approaches, our proposed method allows the robot not only to imitate a\nlimit cycle trajectory but also to replicate the transient movement from the\ninitial or disturbed state to the limit cycle. Consequently, our method offers\na safer imitation learning approach that avoids generating unpredictable\nmotions immediately after disturbances or from a specified initial state. We\nfirst validated our proposed method by reconstructing a simple limit-cycle\nattractor. We then compared the proposed approach with a conventional method on\na lemniscate trajectory tracking task with a simulated robot arm. Our findings\nconfirm that our proposed method can more accurately generate transient\nmovements to converge on a target periodic attractor compared to the previous\nstandard approach. 
Subsequently, we applied our method to a real robot arm to\nimitate periodic human movements.\n","authors":["Satoshi Yamamori","Jun Morimoto"],"pdf_url":"https://arxiv.org/pdf/2406.03735v2.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2502.18960v2","updated":"2025-03-02T16:14:51Z","published":"2025-02-26T09:17:04Z","title":"Nonparametric Heterogeneous Long-term Causal Effect Estimation via Data\n Combination","summary":" Long-term causal inference has drawn increasing attention in many scientific\ndomains. Existing methods mainly focus on estimating average long-term causal\neffects by combining long-term observational data and short-term experimental\ndata. However, it is still understudied how to robustly and effectively\nestimate heterogeneous long-term causal effects, significantly limiting\npractical applications. In this paper, we propose several two-stage style\nnonparametric estimators for heterogeneous long-term causal effect estimation,\nincluding propensity-based, regression-based, and multiple robust estimators.\nWe conduct a comprehensive theoretical analysis of their asymptotic properties\nunder mild assumptions, with the ultimate goal of building a better\nunderstanding of the conditions under which some estimators can be expected to\nperform better. Extensive experiments across several semi-synthetic and\nreal-world datasets validate the theoretical results and demonstrate the\neffectiveness of the proposed estimators.\n","authors":["Weilin Chen","Ruichu Cai","Junjie Wan","Zeqin Yang","José Miguel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2502.18960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.20138v5","updated":"2025-03-02T15:57:39Z","published":"2024-12-28T12:54:06Z","title":"TradingAgents: Multi-Agents LLM Financial Trading Framework","summary":" Significant progress has been made in automated problem-solving using\nsocieties of agents powered by large language models (LLMs). 
In finance,\nefforts have largely focused on single-agent systems handling specific tasks or\nmulti-agent frameworks independently gathering data. However, multi-agent\nsystems' potential to replicate real-world trading firms' collaborative\ndynamics remains underexplored. TradingAgents proposes a novel stock trading\nframework inspired by trading firms, featuring LLM-powered agents in\nspecialized roles such as fundamental analysts, sentiment analysts, technical\nanalysts, and traders with varied risk profiles. The framework includes Bull\nand Bear researcher agents assessing market conditions, a risk management team\nmonitoring exposure, and traders synthesizing insights from debates and\nhistorical data to make informed decisions. By simulating a dynamic,\ncollaborative trading environment, this framework aims to improve trading\nperformance. Detailed architecture and extensive experiments reveal its\nsuperiority over baseline models, with notable improvements in cumulative\nreturns, Sharpe ratio, and maximum drawdown, highlighting the potential of\nmulti-agent LLM frameworks in financial trading. TradingAgents is available at\nhttps://github.com/PioneerFintech.\n","authors":["Yijia Xiao","Edward Sun","Di Luo","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.20138v5.pdf","comment":"Multi-Agent AI in the Real World @ AAAI 2025"},{"id":"http://arxiv.org/abs/2408.11915v2","updated":"2025-03-02T15:55:14Z","published":"2024-08-21T18:06:15Z","title":"Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event\n Condition For Foley Sound","summary":" Foley sound synthesis is crucial for multimedia production, enhancing user\nexperience by synchronizing audio and video both temporally and semantically.\nRecent studies on automating this labor-intensive process through\nvideo-to-sound generation face significant challenges. 
Systems lacking explicit\ntemporal features suffer from poor alignment and controllability, while\ntimestamp-based models require costly and subjective human annotation. We\npropose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as an\nintuitive condition with semantic timbre prompts (audio or text). RMS, a\nframe-level intensity envelope closely related to audio semantics, acts as a\ntemporal event feature to guide audio generation from video. The\nannotation-free self-supervised learning framework consists of two stages,\nVideo2RMS and RMS2Sound, incorporating novel ideas including RMS discretization\nand RMS-ControlNet with a pretrained text-to-audio model. Our extensive\nevaluation shows that Video-Foley achieves state-of-the-art performance in\naudio-visual alignment and controllability for sound timing, intensity, timbre,\nand nuance. Source code, model weights and demos are available on our companion\nwebsite. (https://jnwnlee.github.io/video-foley-demo)\n","authors":["Junwon Lee","Jaekwon Im","Dabin Kim","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2408.11915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18709v4","updated":"2025-03-02T15:37:39Z","published":"2023-10-28T13:37:52Z","title":"Audio-Visual Instance Segmentation","summary":" In this paper, we propose a new multi-modal task, termed audio-visual\ninstance segmentation (AVIS), which aims to simultaneously identify, segment\nand track individual sounding object instances in audible videos. To facilitate\nthis research, we introduce a high-quality benchmark named AVISeg, containing\nover 90K instance masks from 26 semantic categories in 926 long videos.\nAdditionally, we propose a strong baseline model for this task. Our model first\nlocalizes sound source within each frame, and condenses object-specific\ncontexts into concise tokens. 
Then it builds long-range audio-visual\ndependencies between these tokens using window-based attention, and tracks\nsounding objects among the entire video sequences. Extensive experiments reveal\nthat our method performs best on AVISeg, surpassing the existing methods from\nrelated tasks. We further conduct the evaluation on several multi-modal large\nmodels. Unfortunately, they exhibits subpar performance on instance-level sound\nsource localization and temporal perception. We expect that AVIS will inspire\nthe community towards a more comprehensive multi-modal understanding. Dataset\nand code is available at https://github.com/ruohaoguo/avis.\n","authors":["Ruohao Guo","Xianghua Ying","Yaru Chen","Dantong Niu","Guangyao Li","Liao Qu","Yanyu Qi","Jinxing Zhou","Bowei Xing","Wenzhen Yue","Ji Shi","Qixun Wang","Peiliang Zhang","Buwen Liang"],"pdf_url":"https://arxiv.org/pdf/2310.18709v4.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2412.12164v2","updated":"2025-03-02T15:12:38Z","published":"2024-12-11T19:12:22Z","title":"GAMED: Knowledge Adaptive Multi-Experts Decoupling for Multimodal Fake\n News Detection","summary":" Multimodal fake news detection often involves modelling heterogeneous data\nsources, such as vision and language. Existing detection methods typically rely\non fusion effectiveness and cross-modal consistency to model the content,\ncomplicating understanding how each modality affects prediction accuracy.\nAdditionally, these methods are primarily based on static feature modelling,\nmaking it difficult to adapt to the dynamic changes and relationships between\ndifferent data modalities. 
This paper develops a significantly novel approach,\nGAMED, for multimodal modelling, which focuses on generating distinctive and\ndiscriminative features through modal decoupling to enhance cross-modal\nsynergies, thereby optimizing overall performance in the detection process.\nGAMED leverages multiple parallel expert networks to refine features and\npre-embed semantic knowledge to improve the experts' ability in information\nselection and viewpoint sharing. Subsequently, the feature distribution of each\nmodality is adaptively adjusted based on the respective experts' opinions.\nGAMED also introduces a novel classification technique to dynamically manage\ncontributions from different modalities, while improving the explainability of\ndecisions. Experimental results on the Fakeddit and Yang datasets demonstrate\nthat GAMED performs better than recently developed state-of-the-art models. The\nsource code can be accessed at https://github.com/slz0925/GAMED.\n","authors":["Lingzhi Shen","Yunfei Long","Xiaohao Cai","Imran Razzak","Guanming Chen","Kang Liu","Shoaib Jameel"],"pdf_url":"https://arxiv.org/pdf/2412.12164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07189v2","updated":"2025-03-02T15:05:58Z","published":"2024-02-11T12:54:07Z","title":"Improving LSH via Tensorized Random Projection","summary":" Locality sensitive hashing (LSH) is a fundamental algorithmic toolkit used by\ndata scientists for approximate nearest neighbour search problems that have\nbeen used extensively in many large scale data processing applications such as\nnear duplicate detection, nearest neighbour search, clustering, etc. In this\nwork, we aim to propose faster and space efficient locality sensitive hash\nfunctions for Euclidean distance and cosine similarity for tensor data.\nTypically, the naive approach for obtaining LSH for tensor data involves first\nreshaping the tensor into vectors, followed by applying existing LSH methods\nfor vector data $E2LSH$ and $SRP$. 
However, this approach becomes impractical\nfor higher order tensors because the size of the reshaped vector becomes\nexponential in the order of the tensor. Consequently, the size of LSH\nparameters increases exponentially. To address this problem, we suggest two\nmethods for LSH for Euclidean distance and cosine similarity, namely\n$CP-E2LSH$, $TT-E2LSH$, and $CP-SRP$, $TT-SRP$, respectively, building on $CP$\nand tensor train $(TT)$ decompositions techniques. Our approaches are space\nefficient and can be efficiently applied to low rank $CP$ or $TT$ tensors. We\nprovide a rigorous theoretical analysis of our proposal on their correctness\nand efficacy.\n","authors":["Bhisham Dev Verma","Rameshwar Pratap"],"pdf_url":"https://arxiv.org/pdf/2402.07189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14154v2","updated":"2025-03-02T14:41:12Z","published":"2024-07-19T09:34:04Z","title":"Where is the Testbed for my Federated Learning Research?","summary":" Progressing beyond centralized AI is of paramount importance, yet,\ndistributed AI solutions, in particular various federated learning (FL)\nalgorithms, are often not comprehensively assessed, which prevents the research\ncommunity from identifying the most promising approaches and practitioners from\nbeing convinced that a certain solution is deployment-ready. The largest hurdle\ntowards FL algorithm evaluation is the difficulty of conducting real-world\nexperiments over a variety of FL client devices and different platforms, with\ndifferent datasets and data distribution, all while assessing various\ndimensions of algorithm performance, such as inference accuracy, energy\nconsumption, and time to convergence, to name a few. In this paper, we present\nCoLExT, a real-world testbed for FL research. 
CoLExT is designed to streamline\nexperimentation with custom FL algorithms in a rich testbed configuration\nspace, with a large number of heterogeneous edge devices, ranging from\nsingle-board computers to smartphones, and provides real-time collection and\nvisualization of a variety of metrics through automatic instrumentation.\nAccording to our evaluation, porting FL algorithms to CoLExT requires minimal\ninvolvement from the developer, and the instrumentation introduces minimal\nresource usage overhead. Furthermore, through an initial investigation\ninvolving popular FL algorithms running on CoLExT, we reveal previously unknown\ntrade-offs, inefficiencies, and programming bugs.\n","authors":["Janez Božič","Amândio R. Faustino","Boris Radovič","Marco Canini","Veljko Pejović"],"pdf_url":"https://arxiv.org/pdf/2407.14154v2.pdf","comment":"SEC 2024"},{"id":"http://arxiv.org/abs/2410.10781v2","updated":"2025-03-02T14:37:53Z","published":"2024-10-14T17:50:28Z","title":"When Attention Sink Emerges in Language Models: An Empirical View","summary":" Language Models (LMs) assign significant attention to the first token, even\nif it is not semantically important, which is known as attention sink. This\nphenomenon has been widely adopted in applications such as streaming/long\ncontext generation, KV cache optimization, inference acceleration, model\nquantization, and others. Despite its widespread use, a deep understanding of\nattention sink in LMs is still lacking. In this work, we first demonstrate that\nattention sinks exist universally in LMs with various inputs, even in small\nmodels. Furthermore, attention sink is observed to emerge during the LM\npre-training, motivating us to investigate how optimization, data distribution,\nloss function, and model architecture in LM pre-training influence its\nemergence. We highlight that attention sink emerges after effective\noptimization on sufficient training data. 
The sink position is highly\ncorrelated with the loss function and data distribution. Most importantly, we\nfind that attention sink acts more like key biases, storing extra attention\nscores, which could be non-informative and not contribute to the value\ncomputation. We also observe that this phenomenon (at least partially) stems\nfrom tokens' inner dependence on attention scores as a result of softmax\nnormalization. After relaxing such dependence by replacing softmax attention\nwith other attention operations, such as sigmoid attention without\nnormalization, attention sinks do not emerge in LMs up to 1B parameters. The\ncode is available at https://github.com/sail-sg/Attention-Sink.\n","authors":["Xiangming Gu","Tianyu Pang","Chao Du","Qian Liu","Fengzhuo Zhang","Cunxiao Du","Ye Wang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2410.10781v2.pdf","comment":"ICLR 2025 (Spotlight)"},{"id":"http://arxiv.org/abs/2410.07137v2","updated":"2025-03-02T14:28:33Z","published":"2024-10-09T17:53:06Z","title":"Cheating Automatic LLM Benchmarks: Null Models Achieve High Win Rates","summary":" Automatic LLM benchmarks, such as AlpacaEval 2.0, Arena-Hard-Auto, and\nMT-Bench, have become popular for evaluating language models due to their\ncost-effectiveness and scalability compared to human evaluation. Achieving high\nwin rates on these benchmarks can significantly boost the promotional impact of\nnewly released language models. This promotional benefit may motivate tricks,\nsuch as manipulating model output length or style to game win rates, even\nthough several mechanisms have been developed to control length and disentangle\nstyle to reduce gameability. 
Nonetheless, we show that even a \"null model\" that\nalways outputs a constant response (irrelevant to input instructions) can cheat\nautomatic benchmarks and achieve top-ranked win rates: an 86.5% LC win rate on\nAlpacaEval 2.0; an 83.0 score on Arena-Hard-Auto; and a 9.55 score on MT-Bench.\nMoreover, the crafted cheating outputs are transferable because we assume that\nthe instructions of these benchmarks (e.g., 805 samples of AlpacaEval 2.0) are\nprivate and cannot be accessed. While our experiments are primarily\nproof-of-concept, an adversary could use LLMs to generate more imperceptible\ncheating responses, unethically benefiting from high win rates and promotional\nimpact. Our findings call for the development of anti-cheating mechanisms for\nreliable automatic benchmarks. The code is available at\nhttps://github.com/sail-sg/Cheating-LLM-Benchmarks.\n","authors":["Xiaosen Zheng","Tianyu Pang","Chao Du","Qian Liu","Jing Jiang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2410.07137v2.pdf","comment":"ICLR 2025 (Oral)"},{"id":"http://arxiv.org/abs/2410.16699v2","updated":"2025-03-02T14:18:13Z","published":"2024-10-22T05:11:45Z","title":"Graph Transformers Dream of Electric Flow","summary":" We show theoretically and empirically that the linear Transformer, when\napplied to graph data, can implement algorithms that solve canonical problems\nsuch as electric flow and eigenvector decomposition. The Transformer has access\nto information on the input graph only via the graph's incidence matrix. We\npresent explicit weight configurations for implementing each algorithm, and we\nbound the constructed Transformers' errors by the errors of the underlying\nalgorithms. Our theoretical findings are corroborated by experiments on\nsynthetic data. Additionally, on a real-world molecular regression task, we\nobserve that the linear Transformer is capable of learning a more effective\npositional encoding than the default one based on Laplacian eigenvectors. 
Our\nwork is an initial step towards elucidating the inner-workings of the\nTransformer for graph data. Code is available at\nhttps://github.com/chengxiang/LinearGraphTransformer\n","authors":["Xiang Cheng","Lawrence Carin","Suvrit Sra"],"pdf_url":"https://arxiv.org/pdf/2410.16699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15252v2","updated":"2025-03-02T14:10:09Z","published":"2024-05-24T06:22:01Z","title":"Accelerating 3D Molecule Generation via Jointly Geometric Optimal\n Transport","summary":" This paper proposes a new 3D molecule generation framework, called GOAT, for\nfast and effective 3D molecule generation based on the flow-matching optimal\ntransport objective. Specifically, we formulate a geometric transport formula\nfor measuring the cost of mapping multi-modal features (e.g., continuous atom\ncoordinates and categorical atom types) between a base distribution and a\ntarget data distribution. Our formula is solved within a joint, equivariant,\nand smooth representation space. This is achieved by transforming the\nmulti-modal features into a continuous latent space with equivariant networks.\nIn addition, we find that identifying optimal distributional coupling is\nnecessary for fast and effective transport between any two distributions. We\nfurther propose a mechanism for estimating and purifying optimal coupling to\ntrain the flow model with optimal transport. By doing so, GOAT can turn\narbitrary distribution couplings into new deterministic couplings, leading to\nan estimated optimal transport plan for fast 3D molecule generation. The\npurification filters out the subpar molecules to ensure the ultimate generation\nquality. We theoretically and empirically prove that the proposed optimal\ncoupling estimation and purification yield transport plan with non-increasing\ncost. 
Finally, extensive experiments show that GOAT enjoys the efficiency of\nsolving geometric optimal transport, leading to a double speedup compared to\nthe sub-optimal method while achieving the best generation quality regarding\nvalidity, uniqueness, and novelty. The code is available at\nhttps://github.com/WanyuGroup/ICLR2025-GOAT.\n","authors":["Haokai Hong","Wanyu Lin","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2405.15252v2.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2302.01310v3","updated":"2025-03-02T13:29:19Z","published":"2023-02-02T18:33:34Z","title":"Knowledge Gradient for Multi-Objective Bayesian Optimization with\n Decoupled Evaluations","summary":" Multi-objective Bayesian optimization aims to find the Pareto front of\ntrade-offs between a set of expensive objectives while collecting as few\nsamples as possible. In some cases, it is possible to evaluate the objectives\nseparately, and a different latency or evaluation cost can be associated with\neach objective. This decoupling of the objectives presents an opportunity to\nlearn the Pareto front faster by avoiding unnecessary, expensive evaluations.\nWe propose a scalarization based knowledge gradient acquisition function which\naccounts for the different evaluation costs of the objectives. We prove\nasymptotic consistency of the estimator of the optimum for an arbitrary,\nD-dimensional, real compact search space and show empirically that the\nalgorithm performs comparably with the state of the art and significantly\noutperforms versions which always evaluate both objectives.\n","authors":["Jack M. Buckingham","Sebastian Rojas Gonzalez","Juergen Branke"],"pdf_url":"https://arxiv.org/pdf/2302.01310v3.pdf","comment":"36 pages. This preprint has not undergone peer review (when\n applicable) or any post-submission improvements or corrections. 
The Version\n of Record of this contribution is published in 'Evolutionary Multi-Criterion\n Optimization', LNCS 15513, and is available online at\n https://doi.org/10.1007/978-981-96-3538-2_9"},{"id":"http://arxiv.org/abs/2411.12556v2","updated":"2025-03-02T13:29:03Z","published":"2024-11-19T15:15:45Z","title":"UMGAD: Unsupervised Multiplex Graph Anomaly Detection","summary":" Graph anomaly detection (GAD) is a critical task in graph machine learning,\nwith the primary objective of identifying anomalous nodes that deviate\nsignificantly from the majority. This task is widely applied in various\nreal-world scenarios, including fraud detection and social network analysis.\nHowever, existing GAD methods still face two major challenges: (1) They are\noften limited to detecting anomalies in single-type interaction graphs and\nstruggle with multiple interaction types in multiplex heterogeneous graphs. (2)\nIn unsupervised scenarios, selecting appropriate anomaly score thresholds\nremains a significant challenge for accurate anomaly detection. To address the\nabove challenges, we propose a novel Unsupervised Multiplex Graph Anomaly\nDetection method, named UMGAD. We first learn multi-relational correlations\namong nodes in multiplex heterogeneous graphs and capture anomaly information\nduring node attribute and structure reconstruction through graph-masked\nautoencoder (GMAE). Then, to further extract abnormal information, we generate\nattribute-level and subgraph-level augmented-view graphs respectively, and\nperform attribute and structure reconstruction through GMAE. Finally, we learn\nto optimize node attributes and structural features through contrastive\nlearning between original-view and augmented-view graphs to improve the model's\nability to capture anomalies. Meanwhile, we also propose a new anomaly score\nthreshold selection strategy, which allows the model to be independent of\nground truth information in real unsupervised scenarios. 
Extensive experiments\non four datasets show that our UMGAD significantly outperforms state-of-the-art\nmethods, achieving average improvements of 13.48% in AUC and 11.68% in Macro-F1\nacross all datasets.\n","authors":["Xiang Li","Jianpeng Qi","Zhongying Zhao","Guanjie Zheng","Lei Cao","Junyu Dong","Yanwei Yu"],"pdf_url":"https://arxiv.org/pdf/2411.12556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.02820v2","updated":"2025-03-02T13:14:11Z","published":"2022-08-04T02:22:29Z","title":"MOVE: Effective and Harmless Ownership Verification via Embedded\n External Features","summary":" Currently, deep neural networks (DNNs) are widely adopted in different\napplications. Despite its commercial values, training a well-performing DNN is\nresource-consuming. Accordingly, the well-trained model is valuable\nintellectual property for its owner. However, recent studies revealed the\nthreats of model stealing, where the adversaries can obtain a function-similar\ncopy of the victim model, even when they can only query the model. In this\npaper, we propose an effective and harmless model ownership verification (MOVE)\nto defend against different types of model stealing simultaneously, without\nintroducing new security risks. In general, we conduct the ownership\nverification by verifying whether a suspicious model contains the knowledge of\ndefender-specified external features. Specifically, we embed the external\nfeatures by modifying a few training samples with style transfer. We then train\na meta-classifier to determine whether a model is stolen from the victim. This\napproach is inspired by the understanding that the stolen models should contain\nthe knowledge of features learned by the victim model. 
In particular,\n\\revision{we develop our MOVE method under both white-box and black-box\nsettings and analyze its theoretical foundation to provide comprehensive model\nprotection.} Extensive experiments on benchmark datasets verify the\neffectiveness of our method and its resistance to potential adaptive attacks.\nThe codes for reproducing the main experiments of our method are available at\nhttps://github.com/THUYimingLi/MOVE.\n","authors":["Yiming Li","Linghui Zhu","Xiaojun Jia","Yang Bai","Yong Jiang","Shu-Tao Xia","Xiaochun Cao","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2208.02820v2.pdf","comment":"This paper has been accepted by IEEE TPAMI 2025. It is the journal\n extension of our conference paper in AAAI 2022\n (https://ojs.aaai.org/index.php/AAAI/article/view/20036). 18 pages"},{"id":"http://arxiv.org/abs/2312.08671v2","updated":"2025-03-02T13:13:42Z","published":"2023-12-14T06:08:35Z","title":"Permutation-Invariant Graph Partitioning:How Graph Neural Networks\n Capture Structural Interactions?","summary":" Graph Neural Networks (GNNs) have paved the way for being a cornerstone in\ngraph-related learning tasks. Yet, the ability of GNNs to capture structural\ninteractions within graphs remains under-explored. In this work, we address\nthis gap by drawing on the insight that permutation invariant graph\npartitioning enables a powerful way of exploring structural interactions. We\nestablish theoretical connections between permutation invariant graph\npartitioning and graph isomorphism, and then propose Graph Partitioning Neural\nNetworks (GPNNs), a novel architecture that efficiently enhances the expressive\npower of GNNs in learning structural interactions. We analyze how partitioning\nschemes and structural interactions contribute to GNN expressivity and their\ntrade-offs with complexity. 
Empirically, we demonstrate that GPNNs outperform\nexisting GNN models in capturing structural interactions across diverse graph\nbenchmark tasks.\n","authors":["Asela Hevapathige","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2312.08671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15812v2","updated":"2025-03-02T12:28:24Z","published":"2024-06-22T10:36:04Z","title":"Intrinsic Dimension Correlation: uncovering nonlinear connections in\n multimodal representations","summary":" To gain insight into the mechanisms behind machine learning methods, it is\ncrucial to establish connections among the features describing data points.\nHowever, these correlations often exhibit a high-dimensional and strongly\nnonlinear nature, which makes them challenging to detect using standard\nmethods. This paper exploits the entanglement between intrinsic dimensionality\nand correlation to propose a metric that quantifies the (potentially nonlinear)\ncorrelation between high-dimensional manifolds. We first validate our method on\nsynthetic data in controlled environments, showcasing its advantages and\ndrawbacks compared to existing techniques. Subsequently, we extend our analysis\nto large-scale applications in neural network representations. Specifically, we\nfocus on latent representations of multimodal data, uncovering clear\ncorrelations between paired visual and textual embeddings, whereas existing\nmethods struggle significantly in detecting similarity. 
Our results indicate\nthe presence of highly nonlinear correlation patterns between latent manifolds.\n","authors":["Lorenzo Basile","Santiago Acevedo","Luca Bortolussi","Fabio Anselmi","Alex Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2406.15812v2.pdf","comment":"Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2405.01229v2","updated":"2025-03-02T12:27:07Z","published":"2024-05-02T12:18:14Z","title":"Boosting Jailbreak Attack with Momentum","summary":" Large Language Models (LLMs) have achieved remarkable success across diverse\ntasks, yet they remain vulnerable to adversarial attacks, notably the\nwell-known jailbreak attack. In particular, the Greedy Coordinate Gradient\n(GCG) attack has demonstrated efficacy in exploiting this vulnerability by\noptimizing adversarial prompts through a combination of gradient heuristics and\ngreedy search. However, the efficiency of this attack has become a bottleneck\nin the attacking process. To mitigate this limitation, in this paper we rethink\nthe generation of the adversarial prompts through an optimization lens, aiming\nto stabilize the optimization process and harness more heuristic insights from\nprevious optimization iterations. Specifically, we propose the\n\\textbf{M}omentum \\textbf{A}ccelerated G\\textbf{C}G (\\textbf{MAC}) attack,\nwhich integrates a momentum term into the gradient heuristic to boost and\nstabilize the random search for tokens in adversarial prompts. Experimental\nresults showcase the notable enhancement achieved by MAC over baselines in\nterms of attack success rate and optimization efficiency. Moreover, we\ndemonstrate that MAC can still exhibit superior performance for transfer\nattacks and models under defense mechanisms. 
Our code is available at\nhttps://github.com/weizeming/momentum-attack-llm.\n","authors":["Yihao Zhang","Zeming Wei"],"pdf_url":"https://arxiv.org/pdf/2405.01229v2.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2412.05994v2","updated":"2025-03-02T12:21:49Z","published":"2024-12-08T16:58:29Z","title":"PIG: Physics-Informed Gaussians as Adaptive Parametric Mesh\n Representations","summary":" The numerical approximation of partial differential equations (PDEs) using\nneural networks has seen significant advancements through Physics-Informed\nNeural Networks (PINNs). Despite their straightforward optimization framework\nand flexibility in implementing various PDEs, PINNs often suffer from limited\naccuracy due to the spectral bias of Multi-Layer Perceptrons (MLPs), which\nstruggle to effectively learn high-frequency and nonlinear components.\nRecently, parametric mesh representations in combination with neural networks\nhave been investigated as a promising approach to eliminate the inductive bias\nof MLPs. However, they usually require high-resolution grids and a large number\nof collocation points to achieve high accuracy while avoiding overfitting. In\naddition, the fixed positions of the mesh parameters restrict their\nflexibility, making accurate approximation of complex PDEs challenging. To\novercome these limitations, we propose Physics-Informed Gaussians (PIGs), which\ncombine feature embeddings using Gaussian functions with a lightweight neural\nnetwork. Our approach uses trainable parameters for the mean and variance of\neach Gaussian, allowing for dynamic adjustment of their positions and shapes\nduring training. This adaptability enables our model to optimally approximate\nPDE solutions, unlike models with fixed parameter positions. Furthermore, the\nproposed approach maintains the same optimization framework used in PINNs,\nallowing us to benefit from their excellent properties. 
Experimental results\nshow the competitive performance of our model across various PDEs,\ndemonstrating its potential as a robust tool for solving complex PDEs. Our\nproject page is available at\nhttps://namgyukang.github.io/Physics-Informed-Gaussians/\n","authors":["Namgyu Kang","Jaemin Oh","Youngjoon Hong","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2412.05994v2.pdf","comment":"Project page:\n https://namgyukang.github.io/Physics-Informed-Gaussians/"},{"id":"http://arxiv.org/abs/2410.12025v2","updated":"2025-03-02T12:20:56Z","published":"2024-10-15T19:46:09Z","title":"Geometric Inductive Biases of Deep Networks: The Role of Data and\n Architecture","summary":" In this paper, we propose the $\\textit{geometric invariance hypothesis\n(GIH)}$, which argues that the input space curvature of a neural network\nremains invariant under transformation in certain architecture-dependent\ndirections during training. We investigate a simple, non-linear binary\nclassification problem residing on a plane in a high dimensional space and\nobserve that$\\unicode{x2014}$unlike MPLs$\\unicode{x2014}$ResNets fail to\ngeneralize depending on the orientation of the plane. Motivated by this\nexample, we define a neural network's $\\textbf{average geometry}$ and\n$\\textbf{average geometry evolution}$ as compact\n$\\textit{architecture-dependent}$ summaries of the model's input-output\ngeometry and its evolution during training. By investigating the average\ngeometry evolution at initialization, we discover that the geometry of a neural\nnetwork evolves according to the data covariance projected onto its average\ngeometry. This means that the geometry only changes in a subset of the input\nspace when the average geometry is low-rank, such as in ResNets. This causes an\narchitecture-dependent invariance property in the input space curvature, which\nwe dub GIH. 
Finally, we present extensive experimental results to observe the\nconsequences of GIH and how it relates to generalization in neural networks.\n","authors":["Sajad Movahedi","Antonio Orvieto","Seyed-Mohsen Moosavi-Dezfooli"],"pdf_url":"https://arxiv.org/pdf/2410.12025v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.14309v2","updated":"2025-03-02T12:04:35Z","published":"2025-02-20T06:51:42Z","title":"On Theoretical Limits of Learning with Label Differential Privacy","summary":" Label differential privacy (DP) is designed for learning problems involving\nprivate labels and public features. While various methods have been proposed\nfor learning under label DP, the theoretical limits remain largely unexplored.\nIn this paper, we investigate the fundamental limits of learning with label DP\nin both local and central models for both classification and regression tasks,\ncharacterized by minimax convergence rates. We establish lower bounds by\nconverting each task into a multiple hypothesis testing problem and bounding\nthe test error. Additionally, we develop algorithms that yield matching upper\nbounds. Our results demonstrate that under label local DP (LDP), the risk has a\nsignificantly faster convergence rate than that under full LDP, i.e. protecting\nboth features and labels, indicating the advantages of relaxing the DP\ndefinition to focus solely on labels. 
In contrast, under the label central DP\n(CDP), the risk is only reduced by a constant factor compared to full DP,\nindicating that the relaxation of CDP only has limited benefits on the\nperformance.\n","authors":["Puning Zhao","Chuan Ma","Li Shen","Shaowei Wang","Rongfei Fan"],"pdf_url":"https://arxiv.org/pdf/2502.14309v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04942v2","updated":"2025-03-02T11:55:15Z","published":"2024-07-06T03:22:57Z","title":"FOSP: Fine-tuning Offline Safe Policy through World Models","summary":" Offline Safe Reinforcement Learning (RL) seeks to address safety constraints\nby learning from static datasets and restricting exploration. However, these\napproaches heavily rely on the dataset and struggle to generalize to unseen\nscenarios safely. In this paper, we aim to improve safety during the deployment\nof vision-based robotic tasks through online fine-tuning an offline pretrained\npolicy. To facilitate effective fine-tuning, we introduce model-based RL, which\nis known for its data efficiency. Specifically, our method employs in-sample\noptimization to improve offline training efficiency while incorporating\nreachability guidance to ensure safety. After obtaining an offline safe policy,\na safe policy expansion approach is leveraged for online fine-tuning. The\nperformance of our method is validated on simulation benchmarks with five\nvision-only tasks and through real-world robot deployment using limited data.\nIt demonstrates that our approach significantly improves the generalization of\noffline policies to unseen safety-constrained scenarios. 
To the best of our\nknowledge, this is the first work to explore offline-to-online RL for safe\ngeneralization tasks.\n","authors":["Chenyang Cao","Yucheng Xin","Silang Wu","Longxiang He","Zichen Yan","Junbo Tan","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04942v2.pdf","comment":"32 pages, ICLR2025"},{"id":"http://arxiv.org/abs/2411.02275v2","updated":"2025-03-02T11:48:40Z","published":"2024-11-04T17:05:37Z","title":"Breaking the Reclustering Barrier in Centroid-based Deep Clustering","summary":" This work investigates an important phenomenon in centroid-based deep\nclustering (DC) algorithms: Performance quickly saturates after a period of\nrapid early gains. Practitioners commonly address early saturation with\nperiodic reclustering, which we demonstrate to be insufficient to address\nperformance plateaus. We call this phenomenon the \"reclustering barrier\" and\nempirically show when the reclustering barrier occurs, what its underlying\nmechanisms are, and how it is possible to Break the Reclustering Barrier with\nour algorithm BRB. BRB avoids early over-commitment to initial clusterings and\nenables continuous adaptation to reinitialized clustering targets while\nremaining conceptually simple. Applying our algorithm to widely-used\ncentroid-based DC algorithms, we show that (1) BRB consistently improves\nperformance across a wide range of clustering benchmarks, (2) BRB enables\ntraining from scratch, and (3) BRB performs competitively against\nstate-of-the-art DC algorithms when combined with a contrastive loss. 
We\nrelease our code and pre-trained models at\nhttps://github.com/Probabilistic-and-Interactive-ML/breaking-the-reclustering-barrier .\n","authors":["Lukas Miklautz","Timo Klein","Kevin Sidak","Collin Leiber","Thomas Lang","Andrii Shkabrii","Sebastian Tschiatschek","Claudia Plant"],"pdf_url":"https://arxiv.org/pdf/2411.02275v2.pdf","comment":"Accepted at ICLR 2025 (Camera-ready version)"},{"id":"http://arxiv.org/abs/2407.05649v4","updated":"2025-03-02T11:37:49Z","published":"2024-07-08T06:21:56Z","title":"Greener GRASS: Enhancing GNNs with Encoding, Rewiring, and Attention","summary":" Graph Neural Networks (GNNs) have become important tools for machine learning\non graph-structured data. In this paper, we explore the synergistic combination\nof graph encoding, graph rewiring, and graph attention, by introducing Graph\nAttention with Stochastic Structures (GRASS), a novel GNN architecture. GRASS\nutilizes relative random walk probabilities (RRWP) encoding and a novel\ndecomposed variant (D-RRWP) to efficiently capture structural information. It\nrewires the input graph by superimposing a random regular graph to enhance\nlong-range information propagation. It also employs a novel additive attention\nmechanism tailored for graph-structured data. Our empirical evaluations\ndemonstrate that GRASS achieves state-of-the-art performance on multiple\nbenchmark datasets, including a 20.3% reduction in mean absolute error on the\nZINC dataset.\n","authors":["Tongzhou Liao","Barnabás Póczos"],"pdf_url":"https://arxiv.org/pdf/2407.05649v4.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2410.02242v2","updated":"2025-03-02T11:32:27Z","published":"2024-10-03T06:30:27Z","title":"Robust Weight Initialization for Tanh Neural Networks with Fixed Point\n Analysis","summary":" As a neural network's depth increases, it can improve generalization\nperformance. 
However, training deep networks is challenging due to gradient and\nsignal propagation issues. To address these challenges, extensive theoretical\nresearch and various methods have been introduced. Despite these advances,\neffective weight initialization methods for tanh neural networks remain\ninsufficiently investigated. This paper presents a novel weight initialization\nmethod for neural networks with tanh activation function. Based on an analysis\nof the fixed points of the function $\\tanh(ax)$, the proposed method aims to\ndetermine values of $a$ that mitigate activation saturation. A series of\nexperiments on various classification datasets and physics-informed neural\nnetworks demonstrates that the proposed method outperforms Xavier\ninitialization methods~(with or without normalization) in terms of robustness\nacross different network sizes, data efficiency, and convergence speed. Code is\navailable at https://github.com/1HyunwooLee/Tanh-Init\n","authors":["Hyunwoo Lee","Hayoung Choi","Hyunju Kim"],"pdf_url":"https://arxiv.org/pdf/2410.02242v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2502.19649v2","updated":"2025-03-02T11:23:58Z","published":"2025-02-27T00:40:01Z","title":"Taxonomy, Opportunities, and Challenges of Representation Engineering\n for Large Language Models","summary":" Representation Engineering (RepE) is a novel paradigm for controlling the\nbehavior of LLMs. Unlike traditional approaches that modify inputs or fine-tune\nthe model, RepE directly manipulates the model's internal representations. As a\nresult, it may offer more effective, interpretable, data-efficient, and\nflexible control over models' behavior. We present the first comprehensive\nsurvey of RepE for LLMs, reviewing the rapidly growing literature to address\nkey questions: What RepE methods exist and how do they differ? For what\nconcepts and problems has RepE been applied? What are the strengths and\nweaknesses of RepE compared to other methods? 
To answer these, we propose a\nunified framework describing RepE as a pipeline comprising representation\nidentification, operationalization, and control. We posit that while RepE\nmethods offer significant potential, challenges remain, including managing\nmultiple concepts, ensuring reliability, and preserving models' performance.\nTowards improving RepE, we identify opportunities for experimental and\nmethodological improvements and construct a guide for best practices.\n","authors":["Jan Wehner","Sahar Abdelnabi","Daniel Tan","David Krueger","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2502.19649v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.04803v4","updated":"2025-03-02T11:22:35Z","published":"2024-10-07T07:27:39Z","title":"Timer-XL: Long-Context Transformers for Unified Time Series Forecasting","summary":" We present Timer-XL, a causal Transformer for unified time series\nforecasting. To uniformly predict multidimensional time series, we generalize\nnext token prediction, predominantly adopted for 1D token sequences, to\nmultivariate next token prediction. The paradigm formulates various forecasting\ntasks as a long-context prediction problem. We opt for decoder-only\nTransformers that capture causal dependencies from varying-length contexts for\nunified forecasting, making predictions on non-stationary univariate time\nseries, multivariate series with complicated dynamics and correlations, as well\nas covariate-informed contexts that include exogenous variables. Technically,\nwe propose a universal TimeAttention to capture fine-grained intra- and\ninter-series dependencies of flattened time series tokens (patches), which is\nfurther enhanced by deft position embedding for temporal causality and variable\nequivalence. Timer-XL achieves state-of-the-art performance across\ntask-specific forecasting benchmarks through a unified approach. 
Based on\nlarge-scale pre-training, Timer-XL achieves state-of-the-art zero-shot\nperformance, making it a promising architecture for pre-trained time series\nmodels. Code is available at this repository:\nhttps://github.com/thuml/Timer-XL.\n","authors":["Yong Liu","Guo Qin","Xiangdong Huang","Jianmin Wang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2410.04803v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02957v4","updated":"2025-03-02T10:59:52Z","published":"2024-03-05T13:25:44Z","title":"On the Asymptotic Mean Square Error Optimality of Diffusion Models","summary":" Diffusion models (DMs) as generative priors have recently shown great\npotential for denoising tasks but lack theoretical understanding with respect\nto their mean square error (MSE) optimality. This paper proposes a novel\ndenoising strategy inspired by the structure of the MSE-optimal conditional\nmean estimator (CME). The resulting DM-based denoiser can be conveniently\nemployed using a pre-trained DM, being particularly fast by truncating reverse\ndiffusion steps and not requiring stochastic re-sampling. We present a\ncomprehensive (non-)asymptotic optimality analysis of the proposed\ndiffusion-based denoiser, demonstrating polynomial-time convergence to the CME\nunder mild conditions. Our analysis also derives a novel Lipschitz constant\nthat depends solely on the DM's hyperparameters. Further, we offer a new\nperspective on DMs, showing that they inherently combine an asymptotically\noptimal denoiser with a powerful generator, modifiable by switching re-sampling\nin the reverse process on or off. 
The theoretical findings are thoroughly\nvalidated with experiments based on various benchmark datasets\n","authors":["Benedikt Fesl","Benedikt Böck","Florian Strasser","Michael Baur","Michael Joham","Wolfgang Utschick"],"pdf_url":"https://arxiv.org/pdf/2403.02957v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13706v3","updated":"2025-03-02T10:52:11Z","published":"2022-12-28T05:43:57Z","title":"End-to-End Modeling Hierarchical Time Series Using Autoregressive\n Transformer and Conditional Normalizing Flow based Reconciliation","summary":" Multivariate time series forecasting with hierarchical structure is pervasive\nin real-world applications, demanding not only predicting each level of the\nhierarchy, but also reconciling all forecasts to ensure coherency, i.e., the\nforecasts should satisfy the hierarchical aggregation constraints. Moreover,\nthe disparities of statistical characteristics between levels can be huge,\nworsened by non-Gaussian distributions and non-linear correlations. To this\nextent, we propose a novel end-to-end hierarchical time series forecasting\nmodel, based on conditioned normalizing flow-based autoregressive transformer\nreconciliation, to represent complex data distribution while simultaneously\nreconciling the forecasts to ensure coherency. Unlike other state-of-the-art\nmethods, we achieve the forecasting and reconciliation simultaneously without\nrequiring any explicit post-processing step. In addition, by harnessing the\npower of deep model, we do not rely on any assumption such as unbiased\nestimates or Gaussian distribution. 
Our evaluation experiments are conducted on\nfour real-world hierarchical datasets from different industrial domains (three\npublic ones and a dataset from the application servers of Alipay's data center)\nand the preliminary results demonstrate efficacy of our proposed method.\n","authors":["Shiyu Wang","Fan Zhou","Yinbo Sun","Lintao Ma","James Zhang","Yangfei Zheng"],"pdf_url":"https://arxiv.org/pdf/2212.13706v3.pdf","comment":"Accepted by the 22nd IEEE International Conference on Data Mining\n (ICDM2022)"},{"id":"http://arxiv.org/abs/2402.05569v5","updated":"2025-03-02T10:48:32Z","published":"2024-02-08T11:10:39Z","title":"Training-Free Message Passing for Learning on Hypergraphs","summary":" Hypergraphs are crucial for modelling higher-order interactions in real-world\ndata. Hypergraph neural networks (HNNs) effectively utilise these structures by\nmessage passing to generate informative node features for various downstream\ntasks like node classification. However, the message passing module in existing\nHNNs typically requires a computationally intensive training process, which\nlimits their practical use. To tackle this challenge, we propose an alternative\napproach by decoupling the usage of hypergraph structural information from the\nmodel learning stage. This leads to a novel training-free message passing\nmodule, named TF-MP-Module, which can be precomputed in the data preprocessing\nstage, thereby reducing the computational burden. We refer to the hypergraph\nneural network equipped with our TF-MP-Module as TF-HNN. 
We theoretically\nsupport the efficiency and effectiveness of TF-HNN by showing that: 1) It is\nmore training-efficient compared to existing HNNs; 2) It utilises as much\ninformation as existing HNNs for node feature generation; and 3) It is robust\nagainst the oversmoothing issue while using long-range interactions.\nExperiments based on seven real-world hypergraph benchmarks in node\nclassification and hyperlink prediction show that, compared to state-of-the-art\nHNNs, TF-HNN exhibits both competitive performance and superior training\nefficiency. Specifically, on the large-scale benchmark, Trivago, TF-HNN\noutperforms the node classification accuracy of the best baseline by 10% with\njust 1% of the training time of that baseline.\n","authors":["Bohan Tang","Zexi Liu","Keyue Jiang","Siheng Chen","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2402.05569v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14666v2","updated":"2025-03-02T10:38:32Z","published":"2024-10-18T17:56:11Z","title":"DiscoGraMS: Enhancing Movie Screen-Play Summarization using Movie\n Character-Aware Discourse Graph","summary":" Summarizing movie screenplays presents a unique set of challenges compared to\nstandard document summarization. Screenplays are not only lengthy, but also\nfeature a complex interplay of characters, dialogues, and scenes, with numerous\ndirect and subtle relationships and contextual nuances that are difficult for\nmachine learning models to accurately capture and comprehend. Recent attempts\nat screenplay summarization focus on fine-tuning transformer-based pre-trained\nmodels, but these models often fall short in capturing long-term dependencies\nand latent relationships, and frequently encounter the \"lost in the middle\"\nissue. To address these challenges, we introduce DiscoGraMS, a novel resource\nthat represents movie scripts as a movie character-aware discourse graph (CaD\nGraph). 
This approach is well-suited for various downstream tasks, such as\nsummarization, question-answering, and salience detection. The model aims to\npreserve all salient information, offering a more comprehensive and faithful\nrepresentation of the screenplay's content. We further explore a baseline\nmethod that combines the CaD Graph with the corresponding movie script through\na late fusion of graph and text modalities, and we present very initial\npromising results.\n","authors":["Maitreya Prafulla Chitale","Uday Bindal","Rajakrishnan Rajkumar","Rahul Mishra"],"pdf_url":"https://arxiv.org/pdf/2410.14666v2.pdf","comment":"Accepted at NAACL 2025 (Main)"},{"id":"http://arxiv.org/abs/2411.15216v2","updated":"2025-03-02T10:23:51Z","published":"2024-11-20T16:17:40Z","title":"Dist Loss: Enhancing Regression in Few-Shot Region through Distribution\n Distance Constraint","summary":" Imbalanced data distributions are prevalent in real-world scenarios, posing\nsignificant challenges in both imbalanced classification and imbalanced\nregression tasks. They often cause deep learning models to overfit in areas of\nhigh sample density (many-shot regions) while underperforming in areas of low\nsample density (few-shot regions). This characteristic restricts the utility of\ndeep learning models in various sectors, notably healthcare, where areas with\nfew-shot data hold greater clinical relevance. While recent studies have shown\nthe benefits of incorporating distribution information in imbalanced\nclassification tasks, such strategies are rarely explored in imbalanced\nregression. 
In this paper, we address this issue by introducing a novel loss\nfunction, termed Dist Loss, designed to minimize the distribution distance\nbetween the model's predictions and the target labels in a differentiable\nmanner, effectively integrating distribution information into model training.\nDist Loss enables deep learning models to regularize their output distribution\nduring training, effectively enhancing their focus on few-shot regions. We have\nconducted extensive experiments across three datasets spanning computer vision\nand healthcare: IMDB-WIKI-DIR, AgeDB-DIR, and ECG-Ka-DIR. The results\ndemonstrate that Dist Loss effectively mitigates the negative impact of\nimbalanced data distribution on model performance, achieving state-of-the-art\nresults in sparse data regions. Furthermore, Dist Loss is easy to integrate,\ncomplementing existing methods.\n","authors":["Guangkun Nie","Gongzheng Tang","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2411.15216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.14897v2","updated":"2025-03-02T10:18:09Z","published":"2025-02-17T21:35:18Z","title":"Market-Derived Financial Sentiment Analysis: Context-Aware Language\n Models for Crypto Forecasting","summary":" Financial Sentiment Analysis (FSA) traditionally relies on human-annotated\nsentiment labels to infer investor sentiment and forecast market movements.\nHowever, inferring the potential market impact of words based on their\nhuman-perceived intentions is inherently challenging. We hypothesize that the\nhistorical market reactions to words, offer a more reliable indicator of their\npotential impact on markets than subjective sentiment interpretations by human\nannotators. To test this hypothesis, a market-derived labeling approach is\nproposed to assign tweet labels based on ensuing short-term price trends,\nenabling the language model to capture the relationship between textual signals\nand market dynamics directly. 
A domain-specific language model was fine-tuned\non these labels, achieving up to an 11% improvement in short-term trend\nprediction accuracy over traditional sentiment-based benchmarks. Moreover, by\nincorporating market and temporal context through prompt-tuning, the proposed\ncontext-aware language model demonstrated an accuracy of 89.6% on a curated\ndataset of 227 impactful Bitcoin-related news events with significant market\nimpacts. Aggregating daily tweet predictions into trading signals, our method\noutperformed traditional fusion models (which combine sentiment-based and\nprice-based predictions). It challenged the assumption that sentiment-based\nsignals are inferior to price-based predictions in forecasting market\nmovements. Backtesting these signals across three distinct market regimes\nyielded robust Sharpe ratios of up to 5.07 in trending markets and 3.73 in\nneutral markets. Our findings demonstrate that language models can serve as\neffective short-term market predictors. This paradigm shift underscores the\nuntapped capabilities of language models in financial decision-making and opens\nnew avenues for market prediction applications.\n","authors":["Hamid Moradi-Kamali","Mohammad-Hossein Rajabi-Ghozlou","Mahdi Ghazavi","Ali Soltani","Amirreza Sattarzadeh","Reza Entezari-Maleki"],"pdf_url":"https://arxiv.org/pdf/2502.14897v2.pdf","comment":"13 pages, 6 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.04810v2","updated":"2025-03-02T17:18:04Z","published":"2024-10-07T07:45:18Z","title":"FedBiP: Heterogeneous One-Shot Federated Learning with Personalized\n Latent Diffusion Models","summary":" One-Shot Federated Learning (OSFL), a special decentralized machine learning\nparadigm, has recently gained significant attention. OSFL requires only a\nsingle round of client data or model upload, which reduces communication costs\nand mitigates privacy threats compared to traditional FL. 
Despite these\npromising prospects, existing methods face challenges due to client data\nheterogeneity and limited data quantity when applied to real-world OSFL\nsystems. Recently, Latent Diffusion Models (LDM) have shown remarkable\nadvancements in synthesizing high-quality images through pretraining on\nlarge-scale datasets, thereby presenting a potential solution to overcome these\nissues. However, directly applying pretrained LDM to heterogeneous OSFL results\nin significant distribution shifts in synthetic data, leading to performance\ndegradation in classification models trained on such data. This issue is\nparticularly pronounced in rare domains, such as medical imaging, which are\nunderrepresented in LDM's pretraining data. To address this challenge, we\npropose Federated Bi-Level Personalization (FedBiP), which personalizes the\npretrained LDM at both instance-level and concept-level. Hereby, FedBiP\nsynthesizes images following the client's local data distribution without\ncompromising the privacy regulations. FedBiP is also the first approach to\nsimultaneously address feature space heterogeneity and client data scarcity in\nOSFL. Our method is validated through extensive experiments on three OSFL\nbenchmarks with feature space heterogeneity, as well as on challenging medical\nand satellite image datasets with label heterogeneity. 
The results demonstrate\nthe effectiveness of FedBiP, which substantially outperforms other OSFL\nmethods.\n","authors":["Haokun Chen","Hang Li","Yao Zhang","Jinhe Bi","Gengyuan Zhang","Yueqi Zhang","Philip Torr","Jindong Gu","Denis Krompass","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2410.04810v2.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2408.11915v2","updated":"2025-03-02T15:55:14Z","published":"2024-08-21T18:06:15Z","title":"Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event\n Condition For Foley Sound","summary":" Foley sound synthesis is crucial for multimedia production, enhancing user\nexperience by synchronizing audio and video both temporally and semantically.\nRecent studies on automating this labor-intensive process through\nvideo-to-sound generation face significant challenges. Systems lacking explicit\ntemporal features suffer from poor alignment and controllability, while\ntimestamp-based models require costly and subjective human annotation. We\npropose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as an\nintuitive condition with semantic timbre prompts (audio or text). RMS, a\nframe-level intensity envelope closely related to audio semantics, acts as a\ntemporal event feature to guide audio generation from video. The\nannotation-free self-supervised learning framework consists of two stages,\nVideo2RMS and RMS2Sound, incorporating novel ideas including RMS discretization\nand RMS-ControlNet with a pretrained text-to-audio model. Our extensive\nevaluation shows that Video-Foley achieves state-of-the-art performance in\naudio-visual alignment and controllability for sound timing, intensity, timbre,\nand nuance. Source code, model weights and demos are available on our companion\nwebsite. 
(https://jnwnlee.github.io/video-foley-demo)\n","authors":["Junwon Lee","Jaekwon Im","Dabin Kim","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2408.11915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18709v4","updated":"2025-03-02T15:37:39Z","published":"2023-10-28T13:37:52Z","title":"Audio-Visual Instance Segmentation","summary":" In this paper, we propose a new multi-modal task, termed audio-visual\ninstance segmentation (AVIS), which aims to simultaneously identify, segment\nand track individual sounding object instances in audible videos. To facilitate\nthis research, we introduce a high-quality benchmark named AVISeg, containing\nover 90K instance masks from 26 semantic categories in 926 long videos.\nAdditionally, we propose a strong baseline model for this task. Our model first\nlocalizes sound source within each frame, and condenses object-specific\ncontexts into concise tokens. Then it builds long-range audio-visual\ndependencies between these tokens using window-based attention, and tracks\nsounding objects among the entire video sequences. Extensive experiments reveal\nthat our method performs best on AVISeg, surpassing the existing methods from\nrelated tasks. We further conduct the evaluation on several multi-modal large\nmodels. Unfortunately, they exhibits subpar performance on instance-level sound\nsource localization and temporal perception. We expect that AVIS will inspire\nthe community towards a more comprehensive multi-modal understanding. 
Dataset\nand code is available at https://github.com/ruohaoguo/avis.\n","authors":["Ruohao Guo","Xianghua Ying","Yaru Chen","Dantong Niu","Guangyao Li","Liao Qu","Yanyu Qi","Jinxing Zhou","Bowei Xing","Wenzhen Yue","Ji Shi","Qixun Wang","Peiliang Zhang","Buwen Liang"],"pdf_url":"https://arxiv.org/pdf/2310.18709v4.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2410.11817v2","updated":"2025-03-02T07:05:19Z","published":"2024-10-15T17:46:31Z","title":"Improving Long-Text Alignment for Text-to-Image Diffusion Models","summary":" The rapid advancement of text-to-image (T2I) diffusion models has enabled\nthem to generate unprecedented results from given texts. However, as text\ninputs become longer, existing encoding methods like CLIP face limitations, and\naligning the generated images with long texts becomes challenging. To tackle\nthese issues, we propose LongAlign, which includes a segment-level encoding\nmethod for processing long texts and a decomposed preference optimization\nmethod for effective alignment training. For segment-level encoding, long texts\nare divided into multiple segments and processed separately. This method\novercomes the maximum input length limits of pretrained encoding models. For\npreference optimization, we provide decomposed CLIP-based preference models to\nfine-tune diffusion models. Specifically, to utilize CLIP-based preference\nmodels for T2I alignment, we delve into their scoring mechanisms and find that\nthe preference scores can be decomposed into two components: a text-relevant\npart that measures T2I alignment and a text-irrelevant part that assesses other\nvisual aspects of human preference. Additionally, we find that the\ntext-irrelevant part contributes to a common overfitting problem during\nfine-tuning. To address this, we propose a reweighting strategy that assigns\ndifferent weights to these two components, thereby reducing overfitting and\nenhancing alignment. 
After fine-tuning $512 \\times 512$ Stable Diffusion (SD)\nv1.5 for about 20 hours using our method, the fine-tuned SD outperforms\nstronger foundation models in T2I alignment, such as PixArt-$\\alpha$ and\nKandinsky v2.2. The code is available at\nhttps://github.com/luping-liu/LongAlign.\n","authors":["Luping Liu","Chao Du","Tianyu Pang","Zehan Wang","Chongxuan Li","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2410.11817v2.pdf","comment":null}]},"2025-03-01T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2303.15263v5","updated":"2025-03-01T23:43:08Z","published":"2023-03-27T14:52:08Z","title":"Joint Person Identity, Gender and Age Estimation from Hand Images using\n Deep Multi-Task Representation Learning","summary":" In this paper, we propose a multi-task representation learning framework to\njointly estimate the identity, gender and age of individuals from their hand\nimages for the purpose of criminal investigations since the hand images are\noften the only available information in cases of serious crime such as sexual\nabuse. We investigate different up-to-date deep learning architectures and\ncompare their performance for joint estimation of identity, gender and age from\nhand images of perpetrators of serious crime. To simplify the age prediction,\nwe create age groups for the age estimation. We make extensive evaluations and\ncomparisons of both convolution-based and transformer-based deep learning\narchitectures on a publicly available 11k hands dataset. Our experimental\nanalysis shows that it is possible to efficiently estimate not only identity\nbut also other attributes such as gender and age of suspects jointly from hand\nimages for criminal investigations, which is crucial in assisting international\npolice forces in the court to identify and convict abusers.\n","authors":["Nathanael L. 
Baisa"],"pdf_url":"https://arxiv.org/pdf/2303.15263v5.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.04821"},{"id":"http://arxiv.org/abs/2404.15709v3","updated":"2025-03-01T23:26:22Z","published":"2024-04-24T07:58:28Z","title":"ViViDex: Learning Vision-based Dexterous Manipulation from Human Videos","summary":" In this work, we aim to learn a unified vision-based policy for\nmulti-fingered robot hands to manipulate a variety of objects in diverse poses.\nThough prior work has shown benefits of using human videos for policy learning,\nperformance gains have been limited by the noise in estimated trajectories.\nMoreover, reliance on privileged object information such as ground-truth object\nstates further limits the applicability in realistic scenarios. To address\nthese limitations, we propose a new framework ViViDex to improve vision-based\npolicy learning from human videos. It first uses reinforcement learning with\ntrajectory guided rewards to train state-based policies for each video,\nobtaining both visually natural and physically plausible trajectories from the\nvideo. We then rollout successful episodes from state-based policies and train\na unified visual policy without using any privileged information. We propose\ncoordinate transformation to further enhance the visual point cloud\nrepresentation, and compare behavior cloning and diffusion policy for the\nvisual policy training. Experiments both in simulation and on the real robot\ndemonstrate that ViViDex outperforms state-of-the-art approaches on three\ndexterous manipulation tasks.\n","authors":["Zerui Chen","Shizhe Chen","Etienne Arlaud","Ivan Laptev","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.15709v3.pdf","comment":"Accepted by ICRA 2025. 
Project Page:\n https://zerchen.github.io/projects/vividex.html"},{"id":"http://arxiv.org/abs/2502.20108v2","updated":"2025-03-01T23:17:26Z","published":"2025-02-27T14:02:14Z","title":"VDT-Auto: End-to-end Autonomous Driving with VLM-Guided Diffusion\n Transformers","summary":" In autonomous driving, dynamic environment and corner cases pose significant\nchallenges to the robustness of ego vehicle's decision-making. To address these\nchallenges, commencing with the representation of state-action mapping in the\nend-to-end autonomous driving paradigm, we introduce a novel pipeline,\nVDT-Auto. Leveraging the advancement of the state understanding of Visual\nLanguage Model (VLM), incorporating with diffusion Transformer-based action\ngeneration, our VDT-Auto parses the environment geometrically and contextually\nfor the conditioning of the diffusion process. Geometrically, we use a\nbird's-eye view (BEV) encoder to extract feature grids from the surrounding\nimages. Contextually, the structured output of our fine-tuned VLM is processed\ninto textual embeddings and noisy paths. During our diffusion process, the\nadded noise for the forward process is sampled from the noisy path output of\nthe fine-tuned VLM, while the extracted BEV feature grids and embedded texts\ncondition the reverse process of our diffusion Transformers. Our VDT-Auto\nachieved 0.52m on average L2 errors and 21% on average collision rate in the\nnuScenes open-loop planning evaluation. Moreover, the real-world demonstration\nexhibited prominent generalizability of our VDT-Auto. 
The code and dataset will\nbe released after acceptance.\n","authors":["Ziang Guo","Konstantin Gubernatorov","Selamawit Asfaw","Zakhar Yagudin","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2502.20108v2.pdf","comment":"Submitted paper"},{"id":"http://arxiv.org/abs/2407.07516v2","updated":"2025-03-01T23:17:11Z","published":"2024-07-10T10:09:12Z","title":"HDKD: Hybrid Data-Efficient Knowledge Distillation Network for Medical\n Image Classification","summary":" Vision Transformers (ViTs) have achieved significant advancement in computer\nvision tasks due to their powerful modeling capacity. However, their\nperformance notably degrades when trained with insufficient data due to lack of\ninherent inductive biases. Distilling knowledge and inductive biases from a\nConvolutional Neural Network (CNN) teacher has emerged as an effective strategy\nfor enhancing the generalization of ViTs on limited datasets. Previous\napproaches to Knowledge Distillation (KD) have pursued two primary paths: some\nfocused solely on distilling the logit distribution from CNN teacher to ViT\nstudent, neglecting the rich semantic information present in intermediate\nfeatures due to the structural differences between them. Others integrated\nfeature distillation along with logit distillation, yet this introduced\nalignment operations that limits the amount of knowledge transferred due to\nmismatched architectures and increased the computational overhead. To this end,\nthis paper presents Hybrid Data-efficient Knowledge Distillation (HDKD)\nparadigm which employs a CNN teacher and a hybrid student. The choice of hybrid\nstudent serves two main aspects. First, it leverages the strengths of both\nconvolutions and transformers while sharing the convolutional structure with\nthe teacher model. Second, this shared structure enables the direct application\nof feature distillation without any information loss or additional\ncomputational overhead. 
Additionally, we propose an efficient light-weight\nconvolutional block named Mobile Channel-Spatial Attention (MBCSA), which\nserves as the primary convolutional block in both teacher and student models.\nExtensive experiments on two medical public datasets showcase the superiority\nof HDKD over other state-of-the-art models and its computational efficiency.\nSource code at: https://github.com/omarsherif200/HDKD\n","authors":["Omar S. EL-Assiouti","Ghada Hamed","Dina Khattab","Hala M. Ebied"],"pdf_url":"https://arxiv.org/pdf/2407.07516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16820v3","updated":"2025-03-01T22:41:18Z","published":"2024-04-25T17:58:43Z","title":"Revisiting Text-to-Image Evaluation with Gecko: On Metrics, Prompts, and\n Human Ratings","summary":" While text-to-image (T2I) generative models have become ubiquitous, they do\nnot necessarily generate images that align with a given prompt. While previous\nwork has evaluated T2I alignment by proposing metrics, benchmarks, and\ntemplates for collecting human judgements, the quality of these components is\nnot systematically measured. Human-rated prompt sets are generally small and\nthe reliability of the ratings -- and thereby the prompt set used to compare\nmodels -- is not evaluated. We address this gap by performing an extensive\nstudy evaluating auto-eval metrics and human templates. We provide three main\ncontributions: (1) We introduce a comprehensive skills-based benchmark that can\ndiscriminate models across different human templates. This skills-based\nbenchmark categorises prompts into sub-skills, allowing a practitioner to\npinpoint not only which skills are challenging, but at what level of complexity\na skill becomes challenging. (2) We gather human ratings across four templates\nand four T2I models for a total of >100K annotations. 
This allows us to\nunderstand where differences arise due to inherent ambiguity in the prompt and\nwhere they arise due to differences in metric and model quality. (3) Finally,\nwe introduce a new QA-based auto-eval metric that is better correlated with\nhuman ratings than existing metrics for our new dataset, across different human\ntemplates, and on TIFA160.\n","authors":["Olivia Wiles","Chuhan Zhang","Isabela Albuquerque","Ivana Kajić","Su Wang","Emanuele Bugliarello","Yasumasa Onoe","Chris Knutsen","Cyrus Rashtchian","Jordi Pont-Tuset","Aida Nematzadeh"],"pdf_url":"https://arxiv.org/pdf/2404.16820v3.pdf","comment":"Accepted to ICLR 2025 (Spotlight)"},{"id":"http://arxiv.org/abs/2312.04465v3","updated":"2025-03-01T22:24:56Z","published":"2023-12-07T17:35:49Z","title":"FitDiff: Robust monocular 3D facial shape and reflectance estimation\n using Diffusion Models","summary":" The remarkable progress in 3D face reconstruction has resulted in high-detail\nand photorealistic facial representations. Recently, Diffusion Models have\nrevolutionized the capabilities of generative methods by surpassing the\nperformance of GANs. In this work, we present FitDiff, a diffusion-based 3D\nfacial avatar generative model. Leveraging diffusion principles, our model\naccurately generates relightable facial avatars, utilizing an identity\nembedding extracted from an \"in-the-wild\" 2D facial image. The introduced\nmulti-modal diffusion model is the first to concurrently output facial\nreflectance maps (diffuse and specular albedo and normals) and shapes,\nshowcasing great generalization capabilities. It is solely trained on an\nannotated subset of a public facial dataset, paired with 3D reconstructions. We\nrevisit the typical 3D facial fitting approach by guiding a reverse diffusion\nprocess using perceptual and face recognition losses. 
Being the first 3D LDM\nconditioned on face recognition embeddings, FitDiff reconstructs relightable\nhuman avatars, that can be used as-is in common rendering engines, starting\nonly from an unconstrained facial image, and achieving state-of-the-art\nperformance.\n","authors":["Stathis Galanakis","Alexandros Lattas","Stylianos Moschoglou","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.04465v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01345v2","updated":"2025-03-01T22:11:10Z","published":"2024-10-02T09:02:34Z","title":"Towards Generalizable Vision-Language Robotic Manipulation: A Benchmark\n and LLM-guided 3D Policy","summary":" Generalizing language-conditioned robotic policies to new tasks remains a\nsignificant challenge, hampered by the lack of suitable simulation benchmarks.\nIn this paper, we address this gap by introducing GemBench, a novel benchmark\nto assess generalization capabilities of vision-language robotic manipulation\npolicies. GemBench incorporates seven general action primitives and four levels\nof generalization, spanning novel placements, rigid and articulated objects,\nand complex long-horizon tasks. We evaluate state-of-the-art approaches on\nGemBench and also introduce a new method. Our approach 3D-LOTUS leverages rich\n3D information for action prediction conditioned on language. While 3D-LOTUS\nexcels in both efficiency and performance on seen tasks, it struggles with\nnovel tasks. To address this, we present 3D-LOTUS++, a framework that\nintegrates 3D-LOTUS's motion planning capabilities with the task planning\ncapabilities of LLMs and the object grounding accuracy of VLMs. 3D-LOTUS++\nachieves state-of-the-art performance on novel tasks of GemBench, setting a new\nstandard for generalization in robotic manipulation. 
The benchmark, codes and\ntrained models are available at\nhttps://www.di.ens.fr/willow/research/gembench/.\n","authors":["Ricardo Garcia","Shizhe Chen","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2410.01345v2.pdf","comment":"ICRA 2025"},{"id":"http://arxiv.org/abs/2407.19617v2","updated":"2025-03-01T19:16:08Z","published":"2024-07-29T00:39:51Z","title":"Leveraging Vision Language Models for Specialized Agricultural Tasks","summary":" As Vision Language Models (VLMs) become increasingly accessible to farmers\nand agricultural experts, there is a growing need to evaluate their potential\nin specialized tasks. We present AgEval, a comprehensive benchmark for\nassessing VLMs' capabilities in plant stress phenotyping, offering a solution\nto the challenge of limited annotated data in agriculture. Our study explores\nhow general-purpose VLMs can be leveraged for domain-specific tasks with only a\nfew annotated examples, providing insights into their behavior and\nadaptability. AgEval encompasses 12 diverse plant stress phenotyping tasks,\nevaluating zero-shot and few-shot in-context learning performance of\nstate-of-the-art models including Claude, GPT, Gemini, and LLaVA. Our results\ndemonstrate VLMs' rapid adaptability to specialized tasks, with the\nbest-performing model showing an increase in F1 scores from 46.24% to 73.37% in\n8-shot identification. To quantify performance disparities across classes, we\nintroduce metrics such as the coefficient of variation (CV), revealing that\nVLMs' training impacts classes differently, with CV ranging from 26.02% to\n58.03%. We also find that strategic example selection enhances model\nreliability, with exact category examples improving F1 scores by 15.38% on\naverage. AgEval establishes a framework for assessing VLMs in agricultural\napplications, offering valuable benchmarks for future evaluations. 
Our findings\nsuggest that VLMs, with minimal few-shot examples, show promise as a viable\nalternative to traditional specialized models in plant stress phenotyping,\nwhile also highlighting areas for further refinement. Results and benchmark\ndetails are available at: https://github.com/arbab-ml/AgEval\n","authors":["Muhammad Arbab Arshad","Talukder Zaki Jubery","Tirtho Roy","Rim Nassiri","Asheesh K. Singh","Arti Singh","Chinmay Hegde","Baskar Ganapathysubramanian","Aditya Balu","Adarsh Krishnamurthy","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2407.19617v2.pdf","comment":"Published at WACV 2025"},{"id":"http://arxiv.org/abs/2410.11019v2","updated":"2025-03-01T18:48:48Z","published":"2024-10-14T19:14:49Z","title":"ET-Former: Efficient Triplane Deformable Attention for 3D Semantic Scene\n Completion From Monocular Camera","summary":" We introduce ET-Former, a novel end-to-end algorithm for semantic scene\ncompletion using a single monocular camera. Our approach generates a semantic\noccupancy map from single RGB observation while simultaneously providing\nuncertainty estimates for semantic predictions. By designing a triplane-based\ndeformable attention mechanism, our approach improves geometric understanding\nof the scene than other SOTA approaches and reduces noise in semantic\npredictions. Additionally, through the use of a Conditional Variational\nAutoEncoder (CVAE), we estimate the uncertainties of these predictions. The\ngenerated semantic and uncertainty maps will help formulate navigation\nstrategies that facilitate safe and permissible decision making in the future.\nEvaluated on the Semantic-KITTI dataset, ET-Former achieves the highest\nIntersection over Union (IoU) and mean IoU (mIoU) scores while maintaining the\nlowest GPU memory usage, surpassing state-of-the-art (SOTA) methods. It\nimproves the SOTA scores of IoU from 44.71 to 51.49 and mIoU from 15.04 to\n16.30 on SeamnticKITTI test, with a notably low training memory consumption of\n10.9 GB. 
Project page: https://github.com/jingGM/ET-Former.git.\n","authors":["Jing Liang","He Yin","Xuewei Qi","Jong Jin Park","Min Sun","Rajasimman Madhivanan","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2410.11019v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.17436v2","updated":"2025-03-01T18:48:22Z","published":"2025-02-24T18:59:55Z","title":"Towards Hierarchical Rectified Flow","summary":" We formulate a hierarchical rectified flow to model data distributions. It\nhierarchically couples multiple ordinary differential equations (ODEs) and\ndefines a time-differentiable stochastic process that generates a data\ndistribution from a known source distribution. Each ODE resembles the ODE that\nis solved in a classic rectified flow, but differs in its domain, i.e.,\nlocation, velocity, acceleration, etc. Unlike the classic rectified flow\nformulation, which formulates a single ODE in the location domain and only\ncaptures the expected velocity field (sufficient to capture a multi-modal data\ndistribution), the hierarchical rectified flow formulation models the\nmulti-modal random velocity field, acceleration field, etc., in their entirety.\nThis more faithful modeling of the random velocity field enables integration\npaths to intersect when the underlying ODE is solved during data generation.\nIntersecting paths in turn lead to integration trajectories that are more\nstraight than those obtained in the classic rectified flow formulation, where\nintegration paths cannot intersect. This leads to modeling of data\ndistributions with fewer neural function evaluations. We empirically verify\nthis on synthetic 1D and 2D data as well as MNIST, CIFAR-10, and ImageNet-32\ndata. 
Our code is available at: https://riccizz.github.io/HRF/.\n","authors":["Yichi Zhang","Yici Yan","Alex Schwing","Zhizhen Zhao"],"pdf_url":"https://arxiv.org/pdf/2502.17436v2.pdf","comment":"ICLR 2025; Project Page: https://riccizz.github.io/HRF/"},{"id":"http://arxiv.org/abs/2409.19599v4","updated":"2025-03-01T17:31:31Z","published":"2024-09-29T07:32:14Z","title":"DATransNet: Dynamic Attention Transformer Network for Infrared Small\n Target Detection","summary":" Infrared small target detection (ISTD) is widely used in civilian and\nmilitary applications. However, ISTD encounters several challenges, including\nthe tendency for small and dim targets to be obscured by complex backgrounds.\nTo address this issue, we propose the Dynamic Attention Transformer Network\n(DATransNet), which aims to extract and preserve detailed information vital for\nsmall targets. DATransNet employs the Dynamic Attention Transformer (DATrans),\nsimulating central difference convolutions (CDC) to extract gradient features.\nFurthermore, we propose a global feature extraction module (GFEM) that offers a\ncomprehensive perspective to prevent the network from focusing solely on\ndetails while neglecting the global information. We compare the network with\nstate-of-the-art (SOTA) approaches and demonstrate that our method performs\neffectively. Our source code is available at\nhttps://github.com/greekinRoma/DATransNet.\n","authors":["Chen Hu","Yian Huang","Kexuan Li","Luping Zhang","Chang Long","Yiming Zhu","Tian Pu","Zhenming Peng"],"pdf_url":"https://arxiv.org/pdf/2409.19599v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09484v3","updated":"2025-03-01T17:29:09Z","published":"2024-11-14T14:37:50Z","title":"Image Matching Filtering and Refinement by Planes and Beyond","summary":" This paper introduces a modular, non-deep learning method for filtering and\nrefining sparse correspondences in image matching. 
Assuming that motion flow\nwithin the scene can be approximated by local homography transformations,\nmatches are aggregated into overlapping clusters corresponding to virtual\nplanes using an iterative RANSAC-based approach, with non-conforming\ncorrespondences discarded. Moreover, the underlying planar structural design\nprovides an explicit map between local patches associated with the matches,\nenabling optional refinement of keypoint positions through cross-correlation\ntemplate matching after patch reprojection. Finally, to enhance robustness and\nfault-tolerance against violations of the piece-wise planar approximation\nassumption, a further strategy is designed for minimizing relative patch\ndistortion in the plane reprojection by introducing an intermediate homography\nthat projects both patches into a common plane. The proposed method is\nextensively evaluated on standard datasets and image matching pipelines, and\ncompared with state-of-the-art approaches. Unlike other current comparisons,\nthe proposed benchmark also takes into account the more general, real, and\npractical cases where camera intrinsics are unavailable. Experimental results\ndemonstrate that our proposed non-deep learning, geometry-based approach\nachieves performances that are either superior to or on par with recent\nstate-of-the-art deep learning methods. 
Finally, this study suggests that there\nare still development potential in actual image matching solutions in the\nconsidered research direction, which could be in the future incorporated in\nnovel deep image matching architectures.\n","authors":["Fabio Bellavia","Zhenjun Zhao","Luca Morelli","Fabio Remondino"],"pdf_url":"https://arxiv.org/pdf/2411.09484v3.pdf","comment":"project page: https://github.com/fb82/MiHo"},{"id":"http://arxiv.org/abs/2410.08208v3","updated":"2025-03-01T15:51:38Z","published":"2024-10-10T17:59:51Z","title":"SPA: 3D Spatial-Awareness Enables Effective Embodied Representation","summary":" In this paper, we introduce SPA, a novel representation learning framework\nthat emphasizes the importance of 3D spatial awareness in embodied AI. Our\napproach leverages differentiable neural rendering on multi-view images to\nendow a vanilla Vision Transformer (ViT) with intrinsic spatial understanding.\nWe present the most comprehensive evaluation of embodied representation\nlearning to date, covering 268 tasks across 8 simulators with diverse policies\nin both single-task and language-conditioned multi-task scenarios. The results\nare compelling: SPA consistently outperforms more than 10 state-of-the-art\nrepresentation methods, including those specifically designed for embodied AI,\nvision-centric tasks, and multi-modal applications, while using less training\ndata. Furthermore, we conduct a series of real-world experiments to confirm its\neffectiveness in practical scenarios. These results highlight the critical role\nof 3D spatial awareness for embodied representation learning. Our strongest\nmodel takes more than 6000 GPU hours to train and we are committed to\nopen-sourcing all code and model weights to foster future research in embodied\nrepresentation learning. 
Project Page: https://haoyizhu.github.io/spa/.\n","authors":["Haoyi Zhu","Honghui Yang","Yating Wang","Jiange Yang","Limin Wang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2410.08208v3.pdf","comment":"Project Page: https://haoyizhu.github.io/spa/"},{"id":"http://arxiv.org/abs/2501.03775v4","updated":"2025-03-01T15:41:19Z","published":"2025-01-07T13:30:54Z","title":"Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection","summary":" While witnessed with rapid development, remote sensing object detection\nremains challenging for detecting high aspect ratio objects. This paper shows\nthat large strip convolutions are good feature representation learners for\nremote sensing object detection and can detect objects of various aspect ratios\nwell. Based on large strip convolutions, we build a new network architecture\ncalled Strip R-CNN, which is simple, efficient, and powerful. Unlike recent\nremote sensing object detectors that leverage large-kernel convolutions with\nsquare shapes, our Strip R-CNN takes advantage of sequential orthogonal large\nstrip convolutions in our backbone network StripNet to capture spatial\ninformation. In addition, we improve the localization capability of\nremote-sensing object detectors by decoupling the detection heads and equipping\nthe localization branch with strip convolutions in our strip head. Extensive\nexperiments on several benchmarks, for example DOTA, FAIR1M, HRSC2016, and\nDIOR, show that our Strip R-CNN can greatly improve previous work. In\nparticular, our 30M model achieves 82.75% mAP on DOTA-v1.0, setting a new\nstate-of-the-art record. 
Our code will be made publicly available.Code is\navailable at https://github.com/YXB-NKU/Strip-R-CNN.\n","authors":["Xinbin Yuan","Zhaohui Zheng","Yuxuan Li","Xialei Liu","Li Liu","Xiang Li","Qibin Hou","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.03775v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13524v2","updated":"2025-03-01T14:42:44Z","published":"2025-02-19T08:21:59Z","title":"MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D\n Medical Image Analysis","summary":" Efficient evaluation of three-dimensional (3D) medical images is crucial for\ndiagnostic and therapeutic practices in healthcare. Recent years have seen a\nsubstantial uptake in applying deep learning and computer vision to analyse and\ninterpret medical images. Traditional approaches, such as convolutional neural\nnetworks (CNNs) and vision transformers (ViTs), face significant computational\nchallenges, prompting the need for architectural advancements. Recent efforts\nhave led to the introduction of novel architectures like the ``Mamba'' model as\nalternative solutions to traditional CNNs or ViTs. The Mamba model excels in\nthe linear processing of one-dimensional data with low computational demands.\nHowever, Mamba's potential for 3D medical image analysis remains underexplored\nand could face significant computational challenges as the dimension increases.\nThis manuscript presents MobileViM, a streamlined architecture for efficient\nsegmentation of 3D medical images. In the MobileViM network, we invent a new\ndimension-independent mechanism and a dual-direction traversing approach to\nincorporate with a vision-Mamba-based framework. MobileViM also features a\ncross-scale bridging technique to improve efficiency and accuracy across\nvarious medical imaging modalities. With these enhancements, MobileViM achieves\nsegmentation speeds exceeding 90 frames per second (FPS) on a single graphics\nprocessing unit (i.e., NVIDIA RTX 4090). 
This performance is over 24 FPS faster\nthan the state-of-the-art deep learning models for processing 3D images with\nthe same computational resources. In addition, experimental evaluations\ndemonstrate that MobileViM delivers superior performance, with Dice similarity\nscores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024,\nATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses\nexisting models.\n","authors":["Wei Dai","Steven Wang","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2502.13524v2.pdf","comment":"The co-authors have not approved its submission to arXiv"},{"id":"http://arxiv.org/abs/2501.01791v2","updated":"2025-03-01T14:17:25Z","published":"2025-01-03T12:48:01Z","title":"Balancing Accuracy and Efficiency for Large-Scale SLAM: A Minimal Subset\n Approach for Scalable Loop Closures","summary":" Typical LiDAR SLAM architectures feature a front-end for odometry estimation\nand a back-end for refining and optimizing the trajectory and map, commonly\nthrough loop closures. However, loop closure detection in large-scale missions\npresents significant computational challenges due to the need to identify,\nverify, and process numerous candidate pairs for pose graph optimization.\nKeyframe sampling bridges the front-end and back-end by selecting frames for\nstoring and processing during global optimization. This article proposes an\nonline keyframe sampling approach that constructs the pose graph using the most\nimpactful keyframes for loop closure. We introduce the Minimal Subset Approach\n(MSA), which optimizes two key objectives: redundancy minimization and\ninformation preservation, implemented within a sliding window framework. By\noperating in the feature space rather than 3-D space, MSA efficiently reduces\nredundant keyframes while retaining essential information. 
In sum, evaluations\non diverse public datasets show that the proposed approach outperforms naive\nmethods in reducing false positive rates in place recognition, while delivering\nsuperior ATE and RPE in metric localization, without the need for manual\nparameter tuning. Additionally, MSA demonstrates efficiency and scalability by\nreducing memory usage and computational overhead during loop closure detection\nand pose graph optimization.\n","authors":["Nikolaos Stathoulopoulos","Christoforos Kanellakis","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2501.01791v2.pdf","comment":"8 pages, 7 Figures, 2 Tables. Submitted"},{"id":"http://arxiv.org/abs/2410.06912v2","updated":"2025-03-01T13:43:36Z","published":"2024-10-09T14:12:50Z","title":"Compositional Entailment Learning for Hyperbolic Vision-Language Models","summary":" Image-text representation learning forms a cornerstone in vision-language\nmodels, where pairs of images and textual descriptions are contrastively\naligned in a shared embedding space. Since visual and textual concepts are\nnaturally hierarchical, recent work has shown that hyperbolic space can serve\nas a high-potential manifold to learn vision-language representation with\nstrong downstream performance. In this work, for the first time we show how to\nfully leverage the innate hierarchical nature of hyperbolic embeddings by\nlooking beyond individual image-text pairs. We propose Compositional Entailment\nLearning for hyperbolic vision-language models. The idea is that an image is\nnot only described by a sentence but is itself a composition of multiple object\nboxes, each with their own textual description. Such information can be\nobtained freely by extracting nouns from sentences and using openly available\nlocalized grounding models. We show how to hierarchically organize images,\nimage boxes, and their textual descriptions through contrastive and\nentailment-based objectives. 
Empirical evaluation on a hyperbolic\nvision-language model trained with millions of image-text pairs shows that the\nproposed compositional learning approach outperforms conventional Euclidean\nCLIP learning, as well as recent hyperbolic alternatives, with better zero-shot\nand retrieval generalization and clearly stronger hierarchical performance.\n","authors":["Avik Pal","Max van Spengler","Guido Maria D'Amely di Melendugno","Alessandro Flaborea","Fabio Galasso","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2410.06912v2.pdf","comment":"Accepted as oral paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2412.09945v3","updated":"2025-03-01T13:24:41Z","published":"2024-12-13T08:10:47Z","title":"Going Beyond Feature Similarity: Effective Dataset distillation based on\n Class-aware Conditional Mutual Information","summary":" Dataset distillation (DD) aims to minimize the time and memory consumption\nneeded for training deep neural networks on large datasets, by creating a\nsmaller synthetic dataset that has similar performance to that of the full real\ndataset. However, current dataset distillation methods often result in\nsynthetic datasets that are excessively difficult for networks to learn from,\ndue to the compression of a substantial amount of information from the original\ndata through metrics measuring feature similarity, e,g., distribution matching\n(DM). In this work, we introduce conditional mutual information (CMI) to assess\nthe class-aware complexity of a dataset and propose a novel method by\nminimizing CMI. Specifically, we minimize the distillation loss while\nconstraining the class-aware complexity of the synthetic dataset by minimizing\nits empirical CMI from the feature space of pre-trained networks,\nsimultaneously. 
Conducting on a thorough set of experiments, we show that our\nmethod can serve as a general regularization method to existing DD methods and\nimprove the performance and training efficiency.\n","authors":["Xinhao Zhong","Bin Chen","Hao Fang","Xulin Gu","Shu-Tao Xia","En-Hui Yang"],"pdf_url":"https://arxiv.org/pdf/2412.09945v3.pdf","comment":"Accepted to ICLR 2025"}],"Multimedia":[{"id":"http://arxiv.org/abs/2503.00625v1","updated":"2025-03-01T21:28:12Z","published":"2025-03-01T21:28:12Z","title":"Perceptual Visual Quality Assessment: Principles, Methods, and Future\n Directions","summary":" As multimedia services such as video streaming, video conferencing, virtual\nreality (VR), and online gaming continue to expand, ensuring high perceptual\nvisual quality becomes a priority to maintain user satisfaction and\ncompetitiveness. However, multimedia content undergoes various distortions\nduring acquisition, compression, transmission, and storage, resulting in the\ndegradation of experienced quality. Thus, perceptual visual quality assessment\n(PVQA), which focuses on evaluating the quality of multimedia content based on\nhuman perception, is essential for optimizing user experiences in advanced\ncommunication systems. Several challenges are involved in the PVQA process,\nincluding diverse characteristics of multimedia content such as image, video,\nVR, point cloud, mesh, multimodality, etc., and complex distortion scenarios as\nwell as viewing conditions. In this paper, we first present an overview of PVQA\nprinciples and methods. This includes both subjective methods, where users\ndirectly rate their experiences, and objective methods, where algorithms\npredict human perception based on measurable factors such as bitrate, frame\nrate, and compression levels. Based on the basics of PVQA, quality predictors\nfor different multimedia data are then introduced. 
In addition to traditional\nimages and videos, immersive multimedia and generative artificial intelligence\n(GenAI) content are also discussed. Finally, the paper concludes with a\ndiscussion on the future directions of PVQA research.\n","authors":["Wei Zhou","Hadi Amirpour","Christian Timmerer","Guangtao Zhai","Patrick Le Callet","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2503.00625v1.pdf","comment":"A tutorial and review"},{"id":"http://arxiv.org/abs/2503.00548v1","updated":"2025-03-01T16:31:02Z","published":"2025-03-01T16:31:02Z","title":"Unbiased Video Scene Graph Generation via Visual and Semantic Dual\n Debiasing","summary":" Video Scene Graph Generation (VidSGG) aims to capture dynamic relationships\namong entities by sequentially analyzing video frames and integrating visual\nand semantic information. However, VidSGG is challenged by significant biases\nthat skew predictions. To mitigate these biases, we propose a VIsual and\nSemantic Awareness (VISA) framework for unbiased VidSGG. VISA addresses visual\nbias through memory-enhanced temporal integration that enhances object\nrepresentations and concurrently reduces semantic bias by iteratively\nintegrating object features with comprehensive semantic information derived\nfrom triplet relationships. This visual-semantics dual debiasing approach\nresults in more unbiased representations of complex scene dynamics. 
Extensive\nexperiments demonstrate the effectiveness of our method, where VISA outperforms\nexisting unbiased VidSGG approaches by a substantial margin (e.g., +13.1%\nimprovement in mR@20 and mR@50 for the SGCLS task under Semi Constraint).\n","authors":["Yanjun Li","Zhaoyang Li","Honghui Chen","Lizhi Xu"],"pdf_url":"https://arxiv.org/pdf/2503.00548v1.pdf","comment":"17 pages, 8 figures, CVPR 2025"},{"id":"http://arxiv.org/abs/2503.00455v1","updated":"2025-03-01T11:35:17Z","published":"2025-03-01T11:35:17Z","title":"PodAgent: A Comprehensive Framework for Podcast Generation","summary":" Existing Existing automatic audio generation methods struggle to generate\npodcast-like audio programs effectively. The key challenges lie in in-depth\ncontent generation, appropriate and expressive voice production. This paper\nproposed PodAgent, a comprehensive framework for creating audio programs.\nPodAgent 1) generates informative topic-discussion content by designing a\nHost-Guest-Writer multi-agent collaboration system, 2) builds a voice pool for\nsuitable voice-role matching and 3) utilizes LLM-enhanced speech synthesis\nmethod to generate expressive conversational speech. Given the absence of\nstandardized evaluation criteria for podcast-like audio generation, we\ndeveloped comprehensive assessment guidelines to effectively evaluate the\nmodel's performance. Experimental results demonstrate PodAgent's effectiveness,\nsignificantly surpassing direct GPT-4 generation in topic-discussion dialogue\ncontent, achieving an 87.4% voice-matching accuracy, and producing more\nexpressive speech through LLM-guided synthesis. Demo page:\nhttps://podcast-agent.github.io/demo/. 
Source code:\nhttps://github.com/yujxx/PodAgent.\n","authors":["Yujia Xiao","Lei He","Haohan Guo","Fenglong Xie","Tan Lee"],"pdf_url":"https://arxiv.org/pdf/2503.00455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.00374v1","updated":"2025-03-01T07:02:30Z","published":"2025-03-01T07:02:30Z","title":"MIRROR: Multi-Modal Pathological Self-Supervised Representation Learning\n via Modality Alignment and Retention","summary":" Histopathology and transcriptomics are fundamental modalities in oncology,\nencapsulating the morphological and molecular aspects of the disease.\nMulti-modal self-supervised learning has demonstrated remarkable potential in\nlearning pathological representations by integrating diverse data sources.\nConventional multi-modal integration methods primarily emphasize modality\nalignment, while paying insufficient attention to retaining the\nmodality-specific structures. However, unlike conventional scenarios where\nmulti-modal inputs share highly overlapping features, histopathology and\ntranscriptomics exhibit pronounced heterogeneity, offering orthogonal yet\ncomplementary insights. Histopathology provides morphological and spatial\ncontext, elucidating tissue architecture and cellular topology, whereas\ntranscriptomics delineates molecular signatures through gene expression\npatterns. This inherent disparity introduces a major challenge in aligning them\nwhile maintaining modality-specific fidelity. To address these challenges, we\npresent MIRROR, a novel multi-modal representation learning method designed to\nfoster both modality alignment and retention. MIRROR employs dedicated encoders\nto extract comprehensive features for each modality, which is further\ncomplemented by a modality alignment module to achieve seamless integration\nbetween phenotype patterns and molecular profiles. 
Furthermore, a modality\nretention module safeguards unique attributes from each modality, while a style\nclustering module mitigates redundancy and enhances disease-relevant\ninformation by modeling and aligning consistent pathological signatures within\na clustering space. Extensive evaluations on TCGA cohorts for cancer subtyping\nand survival analysis highlight MIRROR's superior performance, demonstrating\nits effectiveness in constructing comprehensive oncological feature\nrepresentations and benefiting the cancer diagnosis.\n","authors":["Tianyi Wang","Jianan Fan","Dingxin Zhang","Dongnan Liu","Yong Xia","Heng Huang","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2503.00374v1.pdf","comment":"10 pages, 5 figures, 3 tables"}]},"2025-03-04T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2502.20041v3","updated":"2025-03-04T07:37:57Z","published":"2025-02-27T12:29:44Z","title":"3D-AffordanceLLM: Harnessing Large Language Models for Open-Vocabulary\n Affordance Detection in 3D Worlds","summary":" 3D Affordance detection is a challenging problem with broad applications on\nvarious robotic tasks. Existing methods typically formulate the detection\nparadigm as a label-based semantic segmentation task. This paradigm relies on\npredefined labels and lacks the ability to comprehend complex natural language,\nresulting in limited generalization in open-world scene. To address these\nlimitations, we reformulate the traditional affordance detection paradigm into\n\\textit{Instruction Reasoning Affordance Segmentation} (IRAS) task. This task\nis designed to output a affordance mask region given a query reasoning text,\nwhich avoids fixed categories of input labels. We accordingly propose the\n\\textit{3D-AffordanceLLM} (3D-ADLLM), a framework designed for reasoning\naffordance detection in 3D open-scene. 
Specifically, 3D-ADLLM introduces large\nlanguage models (LLMs) to 3D affordance perception with a custom-designed\ndecoder for generating affordance masks, thus achieving open-world reasoning\naffordance detection. In addition, given the scarcity of 3D affordance datasets\nfor training large models, we seek to extract knowledge from general\nsegmentation data and transfer it to affordance detection. Thus, we propose a\nmulti-stage training strategy that begins with a novel pre-training task, i.e.,\n\\textit{Referring Object Part Segmentation}~(ROPS). This stage is designed to\nequip the model with general recognition and segmentation capabilities at the\nobject-part level. Then followed by fine-tuning with the IRAS task, 3D-ADLLM\nobtains the reasoning ability for affordance detection. In summary, 3D-ADLLM\nleverages the rich world knowledge and human-object interaction reasoning\nability of LLMs, achieving approximately an 8\\% improvement in mIoU on\nopen-vocabulary affordance detection tasks.\n","authors":["Hengshuo Chu","Xiang Deng","Qi Lv","Xiaoyang Chen","Yinchuan Li","Jianye Hao","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2502.20041v3.pdf","comment":"ICLR"},{"id":"http://arxiv.org/abs/2405.14093v4","updated":"2025-03-04T08:24:20Z","published":"2024-05-23T01:43:54Z","title":"A Survey on Vision-Language-Action Models for Embodied AI","summary":" Embodied AI is widely recognized as a key element of artificial general\nintelligence because it involves controlling embodied agents to perform tasks\nin the physical world. Building on the success of large language models and\nvision-language models, a new category of multimodal models -- referred to as\nvision-language-action models (VLAs) -- has emerged to address\nlanguage-conditioned robotic tasks in embodied AI by leveraging their distinct\nability to generate actions. 
In recent years, a myriad of VLAs have been\ndeveloped, making it imperative to capture the rapidly evolving landscape\nthrough a comprehensive survey. To this end, we present the first survey on\nVLAs for embodied AI. This work provides a detailed taxonomy of VLAs, organized\ninto three major lines of research. The first line focuses on individual\ncomponents of VLAs. The second line is dedicated to developing control policies\nadept at predicting low-level actions. The third line comprises high-level task\nplanners capable of decomposing long-horizon tasks into a sequence of subtasks,\nthereby guiding VLAs to follow more general user instructions. Furthermore, we\nprovide an extensive summary of relevant resources, including datasets,\nsimulators, and benchmarks. Finally, we discuss the challenges faced by VLAs\nand outline promising future directions in embodied AI. We have created a\nproject associated with this survey, which is available at\nhttps://github.com/yueen-ma/Awesome-VLA.\n","authors":["Yueen Ma","Zixing Song","Yuzheng Zhuang","Jianye Hao","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2405.14093v4.pdf","comment":"Project page: https://github.com/yueen-ma/Awesome-VLA"},{"id":"http://arxiv.org/abs/2502.20092v3","updated":"2025-03-04T14:00:03Z","published":"2025-02-27T13:51:56Z","title":"WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model\n Evaluation","summary":" The UAV technology is gradually maturing and can provide extremely powerful\nsupport for smart agriculture and precise monitoring. Currently, there is no\ndataset related to green walnuts in the field of agricultural computer vision.\nThus, in order to promote the algorithm design in the field of agricultural\ncomputer vision, we used UAV to collect remote-sensing data from 8 walnut\nsample plots. 
Considering that green walnuts are subject to various lighting\nconditions and occlusion, we constructed a large-scale dataset with a\nhigher-granularity of target features - WalnutData. This dataset contains a\ntotal of 30,240 images and 706,208 instances, and there are 4 target\ncategories: being illuminated by frontal light and unoccluded (A1), being\nbacklit and unoccluded (A2), being illuminated by frontal light and occluded\n(B1), and being backlit and occluded (B2). Subsequently, we evaluated many\nmainstream algorithms on WalnutData and used these evaluation results as the\nbaseline standard. The dataset and all evaluation results can be obtained at\nhttps://github.com/1wuming/WalnutData.\n","authors":["Mingjie Wu","Chenggui Yang","Huihua Wang","Chen Xue","Yibo Wang","Haoyu Wang","Yansong Wang","Can Peng","Yuqi Han","Ruoyu Li","Lijun Yun","Zaiqing Chen","Yuelong Xia"],"pdf_url":"https://arxiv.org/pdf/2502.20092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06437v2","updated":"2025-03-04T23:49:01Z","published":"2024-10-09T00:45:02Z","title":"LocoVR: Multiuser Indoor Locomotion Dataset in Virtual Reality","summary":" Understanding human locomotion is crucial for AI agents such as robots,\nparticularly in complex indoor home environments. Modeling human trajectories\nin these spaces requires insight into how individuals maneuver around physical\nobstacles and manage social navigation dynamics. These dynamics include subtle\nbehaviors influenced by proxemics - the social use of space, such as stepping\naside to allow others to pass or choosing longer routes to avoid collisions.\nPrevious research has developed datasets of human motion in indoor scenes, but\nthese are often limited in scale and lack the nuanced social navigation\ndynamics common in home environments. To address this, we present LocoVR, a\ndataset of 7000+ two-person trajectories captured in virtual reality from over\n130 different indoor home environments. 
LocoVR provides accurate trajectory\ndata and precise spatial information, along with rich examples of\nsocially-motivated movement behaviors. For example, the dataset captures\ninstances of individuals navigating around each other in narrow spaces,\nadjusting paths to respect personal boundaries in living areas, and\ncoordinating movements in high-traffic zones like entryways and kitchens. Our\nevaluation shows that LocoVR significantly enhances model performance in three\npractical indoor tasks utilizing human trajectories, and demonstrates\npredicting socially-aware navigation patterns in home environments.\n","authors":["Kojiro Takeyama","Yimeng Liu","Misha Sra"],"pdf_url":"https://arxiv.org/pdf/2410.06437v2.pdf","comment":"This paper has been accepted to ICLR2025"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2407.06057v3","updated":"2025-03-04T14:33:50Z","published":"2024-07-08T15:59:44Z","title":"Variational Best-of-N Alignment","summary":" Best-of-N (BoN) is a popular and effective algorithm for aligning language\nmodels to human preferences. The algorithm works as follows: at inference time,\nN samples are drawn from the language model, and the sample with the highest\nreward, as judged by a reward model, is returned as the output. Despite its\neffectiveness, BoN is computationally expensive; it reduces sampling throughput\nby a factor of N. To make BoN more efficient at inference time, one strategy is\nto fine-tune the language model to mimic what BoN does during inference. To\nachieve this, we derive the distribution induced by the BoN algorithm. We then\npropose to fine-tune the language model to minimize backward KL divergence to\nthe BoN distribution. Our approach is analogous to mean-field variational\ninference and, thus, we term it variational BoN (vBoN). To the extent this\nfine-tuning is successful and we end up with a good approximation, we have\nreduced the inference cost by a factor of N. 
Our experiments on controlled\ngeneration and summarization tasks show that BoN is the most effective\nalignment method, and our variational approximation to BoN achieves the closest\nperformance to BoN and surpasses models fine-tuned using the standard\nKL-constrained RL objective. In the controlled generation task, vBoN appears\nmore frequently on the Pareto frontier of reward and KL divergence compared to\nother alignment methods. In the summarization task, vBoN achieves high reward\nvalues across various sampling temperatures.\n","authors":["Afra Amini","Tim Vieira","Elliott Ash","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2407.06057v3.pdf","comment":"Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.20092v3","updated":"2025-03-04T14:00:03Z","published":"2025-02-27T13:51:56Z","title":"WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model\n Evaluation","summary":" The UAV technology is gradually maturing and can provide extremely powerful\nsupport for smart agriculture and precise monitoring. Currently, there is no\ndataset related to green walnuts in the field of agricultural computer vision.\nThus, in order to promote the algorithm design in the field of agricultural\ncomputer vision, we used UAV to collect remote-sensing data from 8 walnut\nsample plots. Considering that green walnuts are subject to various lighting\nconditions and occlusion, we constructed a large-scale dataset with a\nhigher-granularity of target features - WalnutData. This dataset contains a\ntotal of 30,240 images and 706,208 instances, and there are 4 target\ncategories: being illuminated by frontal light and unoccluded (A1), being\nbacklit and unoccluded (A2), being illuminated by frontal light and occluded\n(B1), and being backlit and occluded (B2). Subsequently, we evaluated many\nmainstream algorithms on WalnutData and used these evaluation results as the\nbaseline standard. 
The dataset and all evaluation results can be obtained at\nhttps://github.com/1wuming/WalnutData.\n","authors":["Mingjie Wu","Chenggui Yang","Huihua Wang","Chen Xue","Yibo Wang","Haoyu Wang","Yansong Wang","Can Peng","Yuqi Han","Ruoyu Li","Lijun Yun","Zaiqing Chen","Yuelong Xia"],"pdf_url":"https://arxiv.org/pdf/2502.20092v3.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.06057v3","updated":"2025-03-04T14:33:50Z","published":"2024-07-08T15:59:44Z","title":"Variational Best-of-N Alignment","summary":" Best-of-N (BoN) is a popular and effective algorithm for aligning language\nmodels to human preferences. The algorithm works as follows: at inference time,\nN samples are drawn from the language model, and the sample with the highest\nreward, as judged by a reward model, is returned as the output. Despite its\neffectiveness, BoN is computationally expensive; it reduces sampling throughput\nby a factor of N. To make BoN more efficient at inference time, one strategy is\nto fine-tune the language model to mimic what BoN does during inference. To\nachieve this, we derive the distribution induced by the BoN algorithm. We then\npropose to fine-tune the language model to minimize backward KL divergence to\nthe BoN distribution. Our approach is analogous to mean-field variational\ninference and, thus, we term it variational BoN (vBoN). To the extent this\nfine-tuning is successful and we end up with a good approximation, we have\nreduced the inference cost by a factor of N. Our experiments on controlled\ngeneration and summarization tasks show that BoN is the most effective\nalignment method, and our variational approximation to BoN achieves the closest\nperformance to BoN and surpasses models fine-tuned using the standard\nKL-constrained RL objective. In the controlled generation task, vBoN appears\nmore frequently on the Pareto frontier of reward and KL divergence compared to\nother alignment methods. 
In the summarization task, vBoN achieves high reward\nvalues across various sampling temperatures.\n","authors":["Afra Amini","Tim Vieira","Elliott Ash","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2407.06057v3.pdf","comment":"Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2411.12556v3","updated":"2025-03-04T09:56:09Z","published":"2024-11-19T15:15:45Z","title":"UMGAD: Unsupervised Multiplex Graph Anomaly Detection","summary":" Graph anomaly detection (GAD) is a critical task in graph machine learning,\nwith the primary objective of identifying anomalous nodes that deviate\nsignificantly from the majority. This task is widely applied in various\nreal-world scenarios, including fraud detection and social network analysis.\nHowever, existing GAD methods still face two major challenges: (1) They are\noften limited to detecting anomalies in single-type interaction graphs and\nstruggle with multiple interaction types in multiplex heterogeneous graphs. (2)\nIn unsupervised scenarios, selecting appropriate anomaly score thresholds\nremains a significant challenge for accurate anomaly detection. To address the\nabove challenges, we propose a novel Unsupervised Multiplex Graph Anomaly\nDetection method, named UMGAD. We first learn multi-relational correlations\namong nodes in multiplex heterogeneous graphs and capture anomaly information\nduring node attribute and structure reconstruction through graph-masked\nautoencoder (GMAE). Then, to further extract abnormal information, we generate\nattribute-level and subgraph-level augmented-view graphs respectively, and\nperform attribute and structure reconstruction through GMAE. Finally, we learn\nto optimize node attributes and structural features through contrastive\nlearning between original-view and augmented-view graphs to improve the model's\nability to capture anomalies. 
Meanwhile, we also propose a new anomaly score\nthreshold selection strategy, which allows the model to be independent of\nground truth information in real unsupervised scenarios. Extensive experiments\non four datasets show that our UMGAD significantly outperforms state-of-the-art\nmethods, achieving average improvements of 13.48% in AUC and 11.68% in Macro-F1\nacross all datasets.\n","authors":["Xiang Li","Jianpeng Qi","Zhongying Zhao","Guanjie Zheng","Lei Cao","Junyu Dong","Yanwei Yu"],"pdf_url":"https://arxiv.org/pdf/2411.12556v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2502.18495v2","updated":"2025-03-04T15:16:52Z","published":"2025-02-19T01:37:24Z","title":"A Comprehensive Survey on Composed Image Retrieval","summary":" Composed Image Retrieval (CIR) is an emerging yet challenging task that\nallows users to search for target images using a multimodal query, comprising a\nreference image and a modification text specifying the user's desired changes\nto the reference image. Given its significant academic and practical value, CIR\nhas become a rapidly growing area of interest in the computer vision and\nmachine learning communities, particularly with the advances in deep learning.\nTo the best of our knowledge, there is currently no comprehensive review of CIR\nto provide a timely overview of this field. Therefore, we synthesize insights\nfrom over 120 publications in top conferences and journals, including ACM TOIS,\nSIGIR, and CVPR In particular, we systematically categorize existing supervised\nCIR and zero-shot CIR models using a fine-grained taxonomy. For a comprehensive\nreview, we also briefly discuss approaches for tasks closely related to CIR,\nsuch as attribute-based CIR and dialog-based CIR. 
Additionally, we summarize\nbenchmark datasets for evaluation and analyze existing supervised and zero-shot\nCIR methods by comparing experimental results across multiple datasets.\nFurthermore, we present promising future directions in this field, offering\npractical insights for researchers interested in further exploration. The\ncurated collection of related works is maintained and continuously updated in\nhttps://github.com/haokunwen/Awesome-Composed-Image-Retrieval.\n","authors":["Xuemeng Song","Haoqiang Lin","Haokun Wen","Bohan Hou","Mingzhu Xu","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2502.18495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02823v1","updated":"2025-03-04T17:48:48Z","published":"2025-03-04T17:48:48Z","title":"A Multimodal Symphony: Integrating Taste and Sound through Generative AI","summary":" In recent decades, neuroscientific and psychological research has traced\ndirect relationships between taste and auditory perceptions. This article\nexplores multimodal generative models capable of converting taste information\ninto music, building on this foundational research. We provide a brief review\nof the state of the art in this field, highlighting key findings and\nmethodologies. We present an experiment in which a fine-tuned version of a\ngenerative music model (MusicGEN) is used to generate music based on detailed\ntaste descriptions provided for each musical piece. The results are promising:\naccording the participants' ($n=111$) evaluation, the fine-tuned model produces\nmusic that more coherently reflects the input taste descriptions compared to\nthe non-fine-tuned model. This study represents a significant step towards\nunderstanding and developing embodied interactions between AI, sound, and\ntaste, opening new possibilities in the field of generative AI. 
We release our\ndataset, code and pre-trained model at: https://osf.io/xs5jy/.\n","authors":["Matteo Spanio","Massimiliano Zampini","Antonio Rodà","Franco Pierucci"],"pdf_url":"https://arxiv.org/pdf/2503.02823v1.pdf","comment":"17 pages, 6 figures (2 + 2 figures with 2 subfigures each)"},{"id":"http://arxiv.org/abs/2503.02452v1","updated":"2025-03-04T09:57:24Z","published":"2025-03-04T09:57:24Z","title":"2DGS-Avatar: Animatable High-fidelity Clothed Avatar via 2D Gaussian\n Splatting","summary":" Real-time rendering of high-fidelity and animatable avatars from monocular\nvideos remains a challenging problem in computer vision and graphics. Over the\npast few years, the Neural Radiance Field (NeRF) has made significant progress\nin rendering quality but behaves poorly in run-time performance due to the low\nefficiency of volumetric rendering. Recently, methods based on 3D Gaussian\nSplatting (3DGS) have shown great potential in fast training and real-time\nrendering. However, they still suffer from artifacts caused by inaccurate\ngeometry. To address these problems, we propose 2DGS-Avatar, a novel approach\nbased on 2D Gaussian Splatting (2DGS) for modeling animatable clothed avatars\nwith high-fidelity and fast training performance. Given monocular RGB videos as\ninput, our method generates an avatar that can be driven by poses and rendered\nin real-time. Compared to 3DGS-based methods, our 2DGS-Avatar retains the\nadvantages of fast training and rendering while also capturing detailed,\ndynamic, and photo-realistic appearances. 
We conduct abundant experiments on\npopular datasets such as AvatarRex and THuman4.0, demonstrating impressive\nperformance in both qualitative and quantitative metrics.\n","authors":["Qipeng Yan","Mingyang Sun","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.02452v1.pdf","comment":"ICVRV 2024"},{"id":"http://arxiv.org/abs/2503.02318v1","updated":"2025-03-04T06:18:34Z","published":"2025-03-04T06:18:34Z","title":"Audio-Reasoner: Improving Reasoning Capability in Large Audio Language\n Models","summary":" Recent advancements in multimodal reasoning have largely overlooked the audio\nmodality. We introduce Audio-Reasoner, a large-scale audio language model for\ndeep reasoning in audio tasks. We meticulously curated a large-scale and\ndiverse multi-task audio dataset with simple annotations. Then, we leverage\nclosed-source models to conduct secondary labeling, QA generation, along with\nstructured COT process. These datasets together form a high-quality reasoning\ndataset with 1.2 million reasoning-rich samples, which we name CoTA. Following\ninference scaling principles, we train Audio-Reasoner on CoTA, enabling it to\nachieve great logical capabilities in audio reasoning. Experiments show\nstate-of-the-art performance across key benchmarks, including MMAU-mini\n(+25.42%), AIR-Bench chat/foundation(+14.57%/+10.13%), and MELD (+8.01%). 
Our\nfindings stress the core of structured CoT training in advancing audio\nreasoning.\n","authors":["Zhifei Xie","Mingbao Lin","Zihang Liu","Pengcheng Wu","Shuicheng Yan","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2503.02318v1.pdf","comment":"Technical report, in process"},{"id":"http://arxiv.org/abs/2310.07236v4","updated":"2025-03-04T03:47:04Z","published":"2023-10-11T06:56:08Z","title":"AdaMesh: Personalized Facial Expressions and Head Poses for Adaptive\n Speech-Driven 3D Facial Animation","summary":" Speech-driven 3D facial animation aims at generating facial movements that\nare synchronized with the driving speech, which has been widely explored\nrecently. Existing works mostly neglect the person-specific talking style in\ngeneration, including facial expression and head pose styles. Several works\nintend to capture the personalities by fine-tuning modules. However, limited\ntraining data leads to the lack of vividness. In this work, we propose AdaMesh,\na novel adaptive speech-driven facial animation approach, which learns the\npersonalized talking style from a reference video of about 10 seconds and\ngenerates vivid facial expressions and head poses. Specifically, we propose\nmixture-of-low-rank adaptation (MoLoRA) to fine-tune the expression adapter,\nwhich efficiently captures the facial expression style. For the personalized\npose style, we propose a pose adapter by building a discrete pose prior and\nretrieving the appropriate style embedding with a semantic-aware pose style\nmatrix without fine-tuning. Extensive experimental results show that our\napproach outperforms state-of-the-art methods, preserves the talking style in\nthe reference video, and generates vivid facial animation. 
The supplementary\nvideo and code will be available at https://adamesh.github.io.\n","authors":["Liyang Chen","Weihong Bao","Shun Lei","Boshi Tang","Zhiyong Wu","Shiyin Kang","Haozhi Huang","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2310.07236v4.pdf","comment":"Accepted by IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2503.02199v1","updated":"2025-03-04T02:21:07Z","published":"2025-03-04T02:21:07Z","title":"Words or Vision: Do Vision-Language Models Have Blind Faith in Text?","summary":" Vision-Language Models (VLMs) excel in integrating visual and textual\ninformation for vision-centric tasks, but their handling of inconsistencies\nbetween modalities is underexplored. We investigate VLMs' modality preferences\nwhen faced with visual data and varied textual inputs in vision-centered\nsettings. By introducing textual variations to four vision-centric tasks and\nevaluating ten Vision-Language Models (VLMs), we discover a \\emph{``blind faith\nin text''} phenomenon: VLMs disproportionately trust textual data over visual\ndata when inconsistencies arise, leading to significant performance drops under\ncorrupted text and raising safety concerns. We analyze factors influencing this\ntext bias, including instruction prompts, language model size, text relevance,\ntoken order, and the interplay between visual and textual certainty. While\ncertain factors, such as scaling up the language model size, slightly mitigate\ntext bias, others like token order can exacerbate it due to positional biases\ninherited from language models. To address this issue, we explore supervised\nfine-tuning with text augmentation and demonstrate its effectiveness in\nreducing text bias. Additionally, we provide a theoretical analysis suggesting\nthat the blind faith in text phenomenon may stem from an imbalance of pure text\nand multi-modal data during training. 
Our findings highlight the need for\nbalanced training and careful consideration of modality interactions in VLMs to\nenhance their robustness and reliability in handling multi-modal data\ninconsistencies.\n","authors":["Ailin Deng","Tri Cao","Zhirui Chen","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2503.02199v1.pdf","comment":"Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2412.17049v2","updated":"2025-03-04T02:14:35Z","published":"2024-12-22T15:00:16Z","title":"Modular Conversational Agents for Surveys and Interviews","summary":" Surveys and interviews are widely used for collecting insights on emerging or\nhypothetical scenarios. Traditional human-led methods often face challenges\nrelated to cost, scalability, and consistency. Recently, various domains have\nbegun to explore the use of conversational agents (chatbots) powered by\ngenerative artificial intelligence (AI) technologies. However, considering\ndecisions in transportation investments and policies often carry significant\npublic and environmental stakes, surveys and interviews face unique challenges\nin integrating AI agents, underscoring the need for a rigorous,\nresource-efficient approach that enhances participant engagement and ensures\nprivacy. This paper addresses this gap by introducing a modular approach and\nits resulting parameterized process for designing AI agents. We detail the\nsystem architecture, integrating engineered prompts, specialized knowledge\nbases, and customizable, goal-oriented conversational logic. 
We demonstrate the\nadaptability, generalizability, and efficacy of our modular approach through\nthree empirical studies: (1) travel preference surveys, highlighting\nconditional logic and multimodal (voice, text, and image generation)\ncapabilities; (2) public opinion elicitation on a newly constructed, novel\ninfrastructure project, showcasing question customization and multilingual\n(English and French) capabilities; and (3) expert consultation about the impact\nof technologies on future transportation systems, highlighting real-time,\nclarification request capabilities for open-ended questions, resilience in\nhandling erratic inputs, and efficient transcript postprocessing. The results\nsuggest that the AI agent increases completion rates and response quality.\nFurthermore, the modular approach demonstrates controllability, flexibility,\nand robustness while addressing key ethical, privacy, security, and token\nconsumption concerns.\n","authors":["Jiangbo Yu","Jinhua Zhao","Luis Miranda-Moreno","Matthew Korp"],"pdf_url":"https://arxiv.org/pdf/2412.17049v2.pdf","comment":null}],"Genomics":[{"id":"http://arxiv.org/abs/2406.13839v2","updated":"2025-03-04T20:59:58Z","published":"2024-06-19T21:06:44Z","title":"RNA-FrameFlow: Flow Matching for de novo 3D RNA Backbone Design","summary":" We introduce RNA-FrameFlow, the first generative model for 3D RNA backbone\ndesign. We build upon SE(3) flow matching for protein backbone generation and\nestablish protocols for data preparation and evaluation to address unique\nchallenges posed by RNA modeling. We formulate RNA structures as a set of\nrigid-body frames and associated loss functions which account for larger, more\nconformationally flexible RNA backbones (13 atoms per nucleotide) vs. proteins\n(4 atoms per residue). Toward tackling the lack of diversity in 3D RNA\ndatasets, we explore training with structural clustering and cropping\naugmentations. 
Additionally, we define a suite of evaluation metrics to measure\nwhether the generated RNA structures are globally self-consistent (via inverse\nfolding followed by forward folding) and locally recover RNA-specific\nstructural descriptors. The most performant version of RNA-FrameFlow generates\nlocally realistic RNA backbones of 40-150 nucleotides, over 40% of which pass\nour validity criteria as measured by a self-consistency TM-score >= 0.45, at\nwhich two RNAs have the same global fold. Open-source code:\nhttps://github.com/rish-16/rna-backbone-design\n","authors":["Rishabh Anand","Chaitanya K. Joshi","Alex Morehead","Arian R. Jamasb","Charles Harris","Simon V. Mathis","Kieran Didi","Bryan Hooi","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2406.13839v2.pdf","comment":"Oral presentation at Machine Learning in Computational Biology\n (MLCB), 2024. Also presented as an Oral at ICML 2024 Structured Probabilistic\n Inference & Generative Modeling Workshop, and a Spotlight at ICML 2024\n AI4Science Workshop"},{"id":"http://arxiv.org/abs/2503.02997v1","updated":"2025-03-04T20:44:37Z","published":"2025-03-04T20:44:37Z","title":"Enabling Fast, Accurate, and Efficient Real-Time Genome Analysis via New\n Algorithms and Techniques","summary":" The advent of high-throughput sequencing technologies has revolutionized\ngenome analysis by enabling the rapid and cost-effective sequencing of large\ngenomes. Despite these advancements, the increasing complexity and volume of\ngenomic data present significant challenges related to accuracy, scalability,\nand computational efficiency. These challenges are mainly due to various forms\nof unwanted and unhandled variations in sequencing data, collectively referred\nto as noise. 
In this dissertation, we address these challenges by providing a\ndeep understanding of different types of noise in genomic data and developing\ntechniques to mitigate the impact of noise on genome analysis.\n First, we introduce BLEND, a noise-tolerant hashing mechanism that quickly\nidentifies both exactly matching and highly similar sequences with arbitrary\ndifferences using a single lookup of their hash values. Second, to enable\nscalable and accurate analysis of noisy raw nanopore signals, we propose\nRawHash, a novel mechanism that effectively reduces noise in raw nanopore\nsignals and enables accurate, real-time analysis by proposing the first\nhash-based similarity search technique for raw nanopore signals. Third, we\nextend the capabilities of RawHash with RawHash2, an improved mechanism that 1)\nprovides a better understanding of noise in raw nanopore signals to reduce it\nmore effectively and 2) improves the robustness of mapping decisions. Fourth,\nwe explore the broader implications and new applications of raw nanopore signal\nanalysis by introducing Rawsamble, the first mechanism for all-vs-all\noverlapping of raw signals using hash-based search. Rawsamble enables the\nconstruction of de novo assemblies directly from raw signals without\nbasecalling, which opens up new directions and uses for raw nanopore signal\nanalysis.\n","authors":["Can Firtina"],"pdf_url":"https://arxiv.org/pdf/2503.02997v1.pdf","comment":"PhD Thesis submitted to ETH Zurich"},{"id":"http://arxiv.org/abs/2503.03773v1","updated":"2025-03-04T06:53:03Z","published":"2025-03-04T06:53:03Z","title":"A Phylogenetic Approach to Genomic Language Modeling","summary":" Genomic language models (gLMs) have shown mostly modest success in\nidentifying evolutionarily constrained elements in mammalian genomes. To\naddress this issue, we introduce a novel framework for training gLMs that\nexplicitly models nucleotide evolution on phylogenetic trees using multispecies\nwhole-genome alignments. 
Our approach integrates an alignment into the loss\nfunction during training but does not require it for making predictions,\nthereby enhancing the model's applicability. We applied this framework to train\nPhyloGPN, a model that excels at predicting functionally disruptive variants\nfrom a single sequence alone and demonstrates strong transfer learning\ncapabilities.\n","authors":["Carlos Albors","Jianan Canal Li","Gonzalo Benegas","Chengzhong Ye","Yun S. Song"],"pdf_url":"https://arxiv.org/pdf/2503.03773v1.pdf","comment":"15 pages, 7 figures"}]},"2025-03-05T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2503.03751v1","updated":"2025-03-05T18:59:50Z","published":"2025-03-05T18:59:50Z","title":"GEN3C: 3D-Informed World-Consistent Video Generation with Precise Camera\n Control","summary":" We present GEN3C, a generative video model with precise Camera Control and\ntemporal 3D Consistency. Prior video models already generate realistic videos,\nbut they tend to leverage little 3D information, leading to inconsistencies,\nsuch as objects popping in and out of existence. Camera control, if implemented\nat all, is imprecise, because camera parameters are mere inputs to the neural\nnetwork which must then infer how the video depends on the camera. In contrast,\nGEN3C is guided by a 3D cache: point clouds obtained by predicting the\npixel-wise depth of seed images or previously generated frames. When generating\nthe next frames, GEN3C is conditioned on the 2D renderings of the 3D cache with\nthe new camera trajectory provided by the user. Crucially, this means that\nGEN3C neither has to remember what it previously generated nor does it have to\ninfer the image structure from the camera pose. The model, instead, can focus\nall its generative power on previously unobserved regions, as well as advancing\nthe scene state to the next frame. 
Our results demonstrate more precise camera\ncontrol than prior work, as well as state-of-the-art results in sparse-view\nnovel view synthesis, even in challenging settings such as driving scenes and\nmonocular dynamic video. Results are best viewed in videos. Check out our\nwebpage! https://research.nvidia.com/labs/toronto-ai/GEN3C/\n","authors":["Xuanchi Ren","Tianchang Shen","Jiahui Huang","Huan Ling","Yifan Lu","Merlin Nimier-David","Thomas Müller","Alexander Keller","Sanja Fidler","Jun Gao"],"pdf_url":"https://arxiv.org/pdf/2503.03751v1.pdf","comment":"To appear in CVPR 2025. Website:\n https://research.nvidia.com/labs/toronto-ai/GEN3C/"},{"id":"http://arxiv.org/abs/2412.04468v2","updated":"2025-03-05T18:57:01Z","published":"2024-12-05T18:59:55Z","title":"NVILA: Efficient Frontier Visual Language Models","summary":" Visual language models (VLMs) have made significant advances in accuracy in\nrecent years. However, their efficiency has received much less attention. This\npaper introduces NVILA, a family of open VLMs designed to optimize both\nefficiency and accuracy. Building on top of VILA, we improve its model\narchitecture by first scaling up the spatial and temporal resolutions, and then\ncompressing visual tokens. This \"scale-then-compress\" approach enables NVILA to\nefficiently process high-resolution images and long videos. We also conduct a\nsystematic investigation to enhance the efficiency of NVILA throughout its\nentire lifecycle, from training and fine-tuning to deployment. NVILA matches or\nsurpasses the accuracy of many leading open and proprietary VLMs across a wide\nrange of image and video benchmarks. At the same time, it reduces training\ncosts by 4.5X, fine-tuning memory usage by 3.4X, pre-filling latency by\n1.6-2.2X, and decoding latency by 1.2-2.8X. 
We will soon make our code and\nmodels available to facilitate reproducibility.\n","authors":["Zhijian Liu","Ligeng Zhu","Baifeng Shi","Zhuoyang Zhang","Yuming Lou","Shang Yang","Haocheng Xi","Shiyi Cao","Yuxian Gu","Dacheng Li","Xiuyu Li","Yunhao Fang","Yukang Chen","Cheng-Yu Hsieh","De-An Huang","An-Chieh Cheng","Vishwesh Nath","Jinyi Hu","Sifei Liu","Ranjay Krishna","Daguang Xu","Xiaolong Wang","Pavlo Molchanov","Jan Kautz","Hongxu Yin","Song Han","Yao Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03734v1","updated":"2025-03-05T18:44:48Z","published":"2025-03-05T18:44:48Z","title":"OTTER: A Vision-Language-Action Model with Text-Aware Visual Feature\n Extraction","summary":" Vision-Language-Action (VLA) models aim to predict robotic actions based on\nvisual observations and language instructions. Existing approaches require\nfine-tuning pre-trained visionlanguage models (VLMs) as visual and language\nfeatures are independently fed into downstream policies, degrading the\npre-trained semantic alignments. We propose OTTER, a novel VLA architecture\nthat leverages these existing alignments through explicit, text-aware visual\nfeature extraction. Instead of processing all visual features, OTTER\nselectively extracts and passes only task-relevant visual features that are\nsemantically aligned with the language instruction to the policy transformer.\nThis allows OTTER to keep the pre-trained vision-language encoders frozen.\nThereby, OTTER preserves and utilizes the rich semantic understanding learned\nfrom large-scale pre-training, enabling strong zero-shot generalization\ncapabilities. In simulation and real-world experiments, OTTER significantly\noutperforms existing VLA models, demonstrating strong zeroshot generalization\nto novel objects and environments. 
Video, code, checkpoints, and dataset:\nhttps://ottervla.github.io/.\n","authors":["Huang Huang","Fangchen Liu","Letian Fu","Tingfan Wu","Mustafa Mukadam","Jitendra Malik","Ken Goldberg","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2503.03734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03733v1","updated":"2025-03-05T18:44:35Z","published":"2025-03-05T18:44:35Z","title":"Rethinking Deep Clustering Paradigms: Self-Supervision Is All You Need","summary":" The recent advances in deep clustering have been made possible by significant\nprogress in self-supervised and pseudo-supervised learning. However, the\ntrade-off between self-supervision and pseudo-supervision can give rise to\nthree primary issues. The joint training causes Feature Randomness and Feature\nDrift, whereas the independent training causes Feature Randomness and Feature\nTwist. In essence, using pseudo-labels generates random and unreliable\nfeatures. The combination of pseudo-supervision and self-supervision drifts the\nreliable clustering-oriented features. Moreover, moving from self-supervision\nto pseudo-supervision can twist the curved latent manifolds. This paper\naddresses the limitations of existing deep clustering paradigms concerning\nFeature Randomness, Feature Drift, and Feature Twist. We propose a new paradigm\nwith a new strategy that replaces pseudo-supervision with a second round of\nself-supervision training. The new strategy makes the transition between\ninstance-level self-supervision and neighborhood-level self-supervision\nsmoother and less abrupt. Moreover, it prevents the drifting effect that is\ncaused by the strong competition between instance-level self-supervision and\nclustering-level pseudo-supervision. Moreover, the absence of the\npseudo-supervision prevents the risk of generating random features. With this\nnovel approach, our paper introduces a Rethinking of the Deep Clustering\nParadigms, denoted by R-DC. 
Our model is specifically designed to address three\nprimary challenges encountered in Deep Clustering: Feature Randomness, Feature\nDrift, and Feature Twist. Experimental results conducted on six datasets have\nshown that the two-level self-supervision training yields substantial\nimprovements.\n","authors":["Amal Shaheena","Nairouz Mrabahb","Riadh Ksantinia","Abdulla Alqaddoumia"],"pdf_url":"https://arxiv.org/pdf/2503.03733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03726v1","updated":"2025-03-05T18:28:32Z","published":"2025-03-05T18:28:32Z","title":"Active 6D Pose Estimation for Textureless Objects using Multi-View RGB\n Frames","summary":" Estimating the 6D pose of textureless objects from RBG images is an important\nproblem in robotics. Due to appearance ambiguities, rotational symmetries, and\nsevere occlusions, single-view based 6D pose estimators are still unable to\nhandle a wide range of objects, motivating research towards multi-view pose\nestimation and next-best-view prediction that addresses these limitations. In\nthis work, we propose a comprehensive active perception framework for\nestimating the 6D poses of textureless objects using only RGB images. Our\napproach is built upon a key idea: decoupling the 6D pose estimation into a\nsequential two-step process can greatly improve both accuracy and efficiency.\nFirst, we estimate the 3D translation of each object, resolving scale and depth\nambiguities inherent to RGB images. These estimates are then used to simplify\nthe subsequent task of determining the 3D orientation, which we achieve through\ncanonical scale template matching. Building on this formulation, we then\nintroduce an active perception strategy that predicts the next best camera\nviewpoint to capture an RGB image, effectively reducing object pose uncertainty\nand enhancing pose accuracy. We evaluate our method on the public ROBI dataset\nas well as on a transparent object dataset that we created. 
When evaluated\nusing the same camera viewpoints, our multi-view pose estimation significantly\noutperforms state-of-the-art approaches. Furthermore, by leveraging our\nnext-best-view strategy, our method achieves high object pose accuracy with\nsubstantially fewer viewpoints than heuristic-based policies.\n","authors":["Jun Yang","Wenjie Xue","Sahar Ghavidel","Steven L. Waslander"],"pdf_url":"https://arxiv.org/pdf/2503.03726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03708v1","updated":"2025-03-05T17:59:19Z","published":"2025-03-05T17:59:19Z","title":"Rethinking Video Tokenization: A Conditioned Diffusion-based Approach","summary":" Video tokenizers, which transform videos into compact latent representations,\nare key to video generation. Existing video tokenizers are based on the VAE\narchitecture and follow a paradigm where an encoder compresses videos into\ncompact latents, and a deterministic decoder reconstructs the original videos\nfrom these latents. In this paper, we propose a novel\n\\underline{\\textbf{C}}onditioned \\underline{\\textbf{D}}iffusion-based video\n\\underline{\\textbf{T}}okenizer entitled \\textbf{\\ourmethod}, which departs from\nprevious methods by replacing the deterministic decoder with a 3D causal\ndiffusion model. The reverse diffusion generative process of the decoder is\nconditioned on the latent representations derived via the encoder. With a\nfeature caching and sampling acceleration, the framework efficiently\nreconstructs high-fidelity videos of arbitrary lengths. Results show that\n{\\ourmethod} achieves state-of-the-art performance in video reconstruction\ntasks using just a single-step sampling. 
Even a smaller version of {\\ourmethod}\nstill achieves reconstruction results on par with the top two baselines.\nFurthermore, the latent video generation model trained using {\\ourmethod} also\nshows superior performance.\n","authors":["Nianzu Yang","Pandeng Li","Liming Zhao","Yang Li","Chen-Wei Xie","Yehui Tang","Xudong Lu","Zhihang Liu","Yun Zheng","Yu Liu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2503.03708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11774v2","updated":"2025-03-05T17:57:48Z","published":"2024-10-15T16:55:10Z","title":"Fractal Calibration for long-tailed object detection","summary":" Real-world datasets follow an imbalanced distribution, which poses\nsignificant challenges in rare-category object detection. Recent studies tackle\nthis problem by developing re-weighting and re-sampling methods, that utilise\nthe class frequencies of the dataset. However, these techniques focus solely on\nthe frequency statistics and ignore the distribution of the classes in image\nspace, missing important information. In contrast to them, we propose FRActal\nCALibration (FRACAL): a novel post-calibration method for long-tailed object\ndetection. FRACAL devises a logit adjustment method that utilises the fractal\ndimension to estimate how uniformly classes are distributed in image space.\nDuring inference, it uses the fractal dimension to inversely downweight the\nprobabilities of uniformly spaced class predictions achieving balance in two\naxes: between frequent and rare categories, and between uniformly spaced and\nsparsely spaced classes. FRACAL is a post-processing method and it does not\nrequire any training, also it can be combined with many off-the-shelf models\nsuch as one-stage sigmoid detectors and two-stage instance segmentation models.\nFRACAL boosts the rare class performance by up to 8.6% and surpasses all\nprevious methods on LVIS dataset, while showing good generalisation to other\ndatasets such as COCO, V3Det and OpenImages. 
We provide the code at\nhttps://github.com/kostas1515/FRACAL.\n","authors":["Konstantinos Panagiotis Alexandridis","Ismail Elezi","Jiankang Deng","Anh Nguyen","Shan Luo"],"pdf_url":"https://arxiv.org/pdf/2410.11774v2.pdf","comment":"CVPR2025"},{"id":"http://arxiv.org/abs/2503.01776v2","updated":"2025-03-05T17:51:09Z","published":"2025-03-03T17:59:48Z","title":"Beyond Matryoshka: Revisiting Sparse Coding for Adaptive Representation","summary":" Many large-scale systems rely on high-quality deep representations\n(embeddings) to facilitate tasks like retrieval, search, and generative\nmodeling. Matryoshka Representation Learning (MRL) recently emerged as a\nsolution for adaptive embedding lengths, but it requires full model retraining\nand suffers from noticeable performance degradations at short lengths. In this\npaper, we show that sparse coding offers a compelling alternative for achieving\nadaptive representation with minimal overhead and higher fidelity. We propose\nContrastive Sparse Representation (CSR), a method that sparsifies pre-trained\nembeddings into a high-dimensional but selectively activated feature space. By\nleveraging lightweight autoencoding and task-aware contrastive objectives, CSR\npreserves semantic quality while allowing flexible, cost-effective inference at\ndifferent sparsity levels. Extensive experiments on image, text, and multimodal\nbenchmarks demonstrate that CSR consistently outperforms MRL in terms of both\naccuracy and retrieval speed-often by large margins-while also cutting training\ntime to a fraction of that required by MRL. Our results establish sparse coding\nas a powerful paradigm for adaptive representation learning in real-world\napplications where efficiency and fidelity are both paramount. 
Code is\navailable at https://github.com/neilwen987/CSR_Adaptive_Rep\n","authors":["Tiansheng Wen","Yifei Wang","Zequn Zeng","Zhong Peng","Yudi Su","Xinyang Liu","Bo Chen","Hongwei Liu","Stefanie Jegelka","Chenyu You"],"pdf_url":"https://arxiv.org/pdf/2503.01776v2.pdf","comment":"A novel sparse coding framework designed for learning adaptive\n representation"},{"id":"http://arxiv.org/abs/2503.03689v1","updated":"2025-03-05T17:31:45Z","published":"2025-03-05T17:31:45Z","title":"DualDiff+: Dual-Branch Diffusion for High-Fidelity Video Generation with\n Reward Guidance","summary":" Accurate and high-fidelity driving scene reconstruction demands the effective\nutilization of comprehensive scene information as conditional inputs. Existing\nmethods predominantly rely on 3D bounding boxes and BEV road maps for\nforeground and background control, which fail to capture the full complexity of\ndriving scenes and adequately integrate multimodal information. In this work,\nwe present DualDiff, a dual-branch conditional diffusion model designed to\nenhance driving scene generation across multiple views and video sequences.\nSpecifically, we introduce Occupancy Ray-shape Sampling (ORS) as a conditional\ninput, offering rich foreground and background semantics alongside 3D spatial\ngeometry to precisely control the generation of both elements. To improve the\nsynthesis of fine-grained foreground objects, particularly complex and distant\nones, we propose a Foreground-Aware Mask (FGM) denoising loss function.\nAdditionally, we develop the Semantic Fusion Attention (SFA) mechanism to\ndynamically prioritize relevant information and suppress noise, enabling more\neffective multimodal fusion. 
Finally, to ensure high-quality image-to-video\ngeneration, we introduce the Reward-Guided Diffusion (RGD) framework, which\nmaintains global consistency and semantic coherence in generated videos.\nExtensive experiments demonstrate that DualDiff achieves state-of-the-art\n(SOTA) performance across multiple datasets. On the NuScenes dataset, DualDiff\nreduces the FID score by 4.09% compared to the best baseline. In downstream\ntasks, such as BEV segmentation, our method improves vehicle mIoU by 4.50% and\nroad mIoU by 1.70%, while in BEV 3D object detection, the foreground mAP\nincreases by 1.46%. Code will be made available at\nhttps://github.com/yangzhaojason/DualDiff.\n","authors":["Zhao Yang","Zezhong Qian","Xiaofan Li","Weixiang Xu","Gongpeng Zhao","Ruohong Yu","Lingsi Zhu","Longjun Liu"],"pdf_url":"https://arxiv.org/pdf/2503.03689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03664v1","updated":"2025-03-05T16:54:15Z","published":"2025-03-05T16:54:15Z","title":"A Generative Approach to High Fidelity 3D Reconstruction from Text Data","summary":" The convergence of generative artificial intelligence and advanced computer\nvision technologies introduces a groundbreaking approach to transforming\ntextual descriptions into three-dimensional representations. This research\nproposes a fully automated pipeline that seamlessly integrates text-to-image\ngeneration, various image processing techniques, and deep learning methods for\nreflection removal and 3D reconstruction. By leveraging state-of-the-art\ngenerative models like Stable Diffusion, the methodology translates natural\nlanguage inputs into detailed 3D models through a multi-stage workflow.\n The reconstruction process begins with the generation of high-quality images\nfrom textual prompts, followed by enhancement by a reinforcement learning agent\nand reflection removal using the Stable Delight model. 
Advanced image upscaling\nand background removal techniques are then applied to further enhance visual\nfidelity. These refined two-dimensional representations are subsequently\ntransformed into volumetric 3D models using sophisticated machine learning\nalgorithms, capturing intricate spatial relationships and geometric\ncharacteristics. This process achieves a highly structured and detailed output,\nensuring that the final 3D models reflect both semantic accuracy and geometric\nprecision.\n This approach addresses key challenges in generative reconstruction, such as\nmaintaining semantic coherence, managing geometric complexity, and preserving\ndetailed visual information. Comprehensive experimental evaluations will assess\nreconstruction quality, semantic accuracy, and geometric fidelity across\ndiverse domains and varying levels of complexity. By demonstrating the\npotential of AI-driven 3D reconstruction techniques, this research offers\nsignificant implications for fields such as augmented reality (AR), virtual\nreality (VR), and digital content creation.\n","authors":["Venkat Kumar R","Deepak Saravanan"],"pdf_url":"https://arxiv.org/pdf/2503.03664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03663v1","updated":"2025-03-05T16:52:34Z","published":"2025-03-05T16:52:34Z","title":"LION-FS: Fast & Slow Video-Language Thinker as Online Video Assistant","summary":" First-person video assistants are highly anticipated to enhance our daily\nlives through online video dialogue. 
However, existing online video assistants\noften sacrifice assistant efficacy for real-time efficiency by processing\nlow-frame-rate videos with coarse-grained visual features.To overcome the\ntrade-off between efficacy and efficiency, we propose \"Fast & Slow\nVideo-Language Thinker\" as an onLIne videO assistaNt, LION-FS, achieving\nreal-time, proactive, temporally accurate, and contextually precise responses.\nLION-FS adopts a two-stage optimization strategy: 1)Fast Path: Routing-Based\nResponse Determination evaluates frame-by-frame whether an immediate response\nis necessary. To enhance response determination accuracy and handle higher\nframe-rate inputs efficiently, we employ Token Aggregation Routing to\ndynamically fuse spatiotemporal features without increasing token numbers,\nwhile utilizing Token Dropping Routing to eliminate redundant features. 2)Slow\nPath: Multi-granularity Keyframe Augmentation optimizes keyframes during\nresponse generation. To provide comprehensive and detailed responses beyond\natomic actions constrained by training data, fine-grained spatial features and\nhuman-environment interaction features are extracted through multi-granular\npooling. These features are further integrated into a meticulously designed\nmultimodal Thinking Template to guide more precise response generation.\nComprehensive evaluations on online video tasks demonstrate that LION-FS\nachieves state-of-the-art efficacy and efficiency.\n","authors":["Wei Li","Bing Hu","Rui Shao","Leyang Shen","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2503.03663v1.pdf","comment":"Accept to CVPR 2025"},{"id":"http://arxiv.org/abs/2409.07402v2","updated":"2025-03-05T16:48:23Z","published":"2024-09-11T16:42:22Z","title":"What to align in multimodal contrastive learning?","summary":" Humans perceive the world through multisensory integration, blending the\ninformation of different modalities to adapt their behavior. 
Contrastive\nlearning offers an appealing solution for multimodal self-supervised learning.\nIndeed, by considering each modality as a different view of the same entity, it\nlearns to align features of different modalities in a shared representation\nspace. However, this approach is intrinsically limited as it only learns shared\nor redundant information between modalities, while multimodal interactions can\narise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal\nlearning strategy that enables the communication between modalities in a single\nmultimodal space. Instead of imposing cross- or intra- modality constraints, we\npropose to align multimodal representations by maximizing the mutual\ninformation between augmented versions of these multimodal features. Our\ntheoretical analysis shows that shared, synergistic and unique terms of\ninformation naturally emerge from this formulation, allowing us to estimate\nmultimodal interactions beyond redundancy. We test CoMM both in a controlled\nand in a series of real-world settings: in the former, we demonstrate that CoMM\neffectively captures redundant, unique and synergistic information between\nmodalities. In the latter, CoMM learns complex multimodal interactions and\nachieves state-of-the-art results on the seven multimodal benchmarks. Code is\navailable at https://github.com/Duplums/CoMM\n","authors":["Benoit Dufumier","Javiera Castillo-Navarro","Devis Tuia","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2409.07402v2.pdf","comment":"ICLR 2025, 25 pages"},{"id":"http://arxiv.org/abs/2503.03655v1","updated":"2025-03-05T16:35:15Z","published":"2025-03-05T16:35:15Z","title":"Improving 6D Object Pose Estimation of metallic Household and Industry\n Objects","summary":" 6D object pose estimation suffers from reduced accuracy when applied to\nmetallic objects. 
We set out to improve the state-of-the-art by addressing\nchallenges such as reflections and specular highlights in industrial\napplications. Our novel BOP-compatible dataset, featuring a diverse set of\nmetallic objects (cans, household, and industrial items) under various lighting\nand background conditions, provides additional geometric and visual cues. We\ndemonstrate that these cues can be effectively leveraged to enhance overall\nperformance. To illustrate the usefulness of the additional features, we\nimprove upon the GDRNPP algorithm by introducing an additional keypoint\nprediction and material estimator head in order to improve spatial scene\nunderstanding. Evaluations on the new dataset show improved accuracy for\nmetallic objects, supporting the hypothesis that additional geometric and\nvisual cues can improve learning.\n","authors":["Thomas Pöllabauer","Michael Gasser","Tristan Wirth","Sarah Berkei","Volker Knauthe","Arjan Kuijper"],"pdf_url":"https://arxiv.org/pdf/2503.03655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03651v1","updated":"2025-03-05T16:26:58Z","published":"2025-03-05T16:26:58Z","title":"DoraCycle: Domain-Oriented Adaptation of Unified Generative Model in\n Multimodal Cycles","summary":" Adapting generative models to specific domains presents an effective solution\nfor satisfying specialized requirements. However, adapting to some complex\ndomains remains challenging, especially when these domains require substantial\npaired data to capture the targeted distributions. Since unpaired data from a\nsingle modality, such as vision or language, is more readily available, we\nutilize the bidirectional mappings between vision and language learned by the\nunified generative model to enable training on unpaired data for domain\nadaptation. Specifically, we propose DoraCycle, which integrates two multimodal\ncycles: text-to-image-to-text and image-to-text-to-image. 
The model is\noptimized through cross-entropy loss computed at the cycle endpoints, where\nboth endpoints share the same modality. This facilitates self-evolution of the\nmodel without reliance on annotated text-image pairs. Experimental results\ndemonstrate that for tasks independent of paired knowledge, such as\nstylization, DoraCycle can effectively adapt the unified model using only\nunpaired data. For tasks involving new paired knowledge, such as specific\nidentities, a combination of a small set of paired image-text examples and\nlarger-scale unpaired data is sufficient for effective domain-oriented\nadaptation. The code will be released at https://github.com/showlab/DoraCycle.\n","authors":["Rui Zhao","Weijia Mao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2503.03651v1.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03644v1","updated":"2025-03-05T16:20:53Z","published":"2025-03-05T16:20:53Z","title":"DongbaMIE: A Multimodal Information Extraction Dataset for Evaluating\n Semantic Understanding of Dongba Pictograms","summary":" Dongba pictographs are the only pictographs still in use in the world. They\nhave pictorial ideographic features, and their symbols carry rich cultural and\ncontextual information. Due to the lack of relevant datasets, existing research\nhas difficulty in advancing the study of semantic understanding of Dongba\npictographs. To this end, we propose DongbaMIE, the first multimodal dataset\nfor semantic understanding and extraction of Dongba pictographs. The dataset\nconsists of Dongba pictograph images and their corresponding Chinese semantic\nannotations. It contains 23,530 sentence-level and 2,539 paragraph-level\nimages, covering four semantic dimensions: objects, actions, relations, and\nattributes. We systematically evaluate the GPT-4o, Gemini-2.0, and Qwen2-VL\nmodels. Experimental results show that the F1 scores of GPT-4o and Gemini in\nthe best object extraction are only 3.16 and 3.11 respectively. 
The F1 score of\nQwen2-VL after supervised fine-tuning is only 11.49. These results suggest that\ncurrent large multimodal models still face significant challenges in accurately\nrecognizing the diverse semantic information in Dongba pictographs. The dataset\ncan be obtained from this URL.\n","authors":["Xiaojun Bi","Shuo Li","Ziyue Wang","Fuwen Luo","Weizheng Qiao","Lu Han","Ziwei Sun","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2503.03644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03640v1","updated":"2025-03-05T16:19:56Z","published":"2025-03-05T16:19:56Z","title":"An Adaptive Underwater Image Enhancement Framework via Multi-Domain\n Fusion and Color Compensation","summary":" Underwater optical imaging is severely degraded by light absorption,\nscattering, and color distortion, hindering visibility and accurate image\nanalysis. This paper presents an adaptive enhancement framework integrating\nillumination compensation, multi-domain filtering, and dynamic color\ncorrection. A hybrid illumination compensation strategy combining CLAHE, Gamma\ncorrection, and Retinex enhances visibility. A two-stage filtering process,\nincluding spatial-domain (Gaussian, Bilateral, Guided) and frequency-domain\n(Fourier, Wavelet) methods, effectively reduces noise while preserving details.\nTo correct color distortion, an adaptive color compensation (ACC) model\nestimates spectral attenuation and water type to combine RCP, DCP, and MUDCP\ndynamically. Finally, a perceptually guided color balance mechanism ensures\nnatural color restoration. 
Experimental results on benchmark datasets\ndemonstrate superior performance over state-of-the-art methods in contrast\nenhancement, color correction, and structural preservation, making the\nframework robust for underwater imaging applications.\n","authors":["Yuezhe Tian","Kangchen Yao","Xiaoyang Yu"],"pdf_url":"https://arxiv.org/pdf/2503.03640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03637v1","updated":"2025-03-05T16:16:46Z","published":"2025-03-05T16:16:46Z","title":"4D Radar Ground Truth Augmentation with LiDAR-to-4D Radar Data Synthesis","summary":" Ground truth augmentation (GT-Aug) is a common method for LiDAR-based object\ndetection, as it enhances object density by leveraging ground truth bounding\nboxes (GT bboxes). However, directly applying GT-Aug to 4D Radar tensor data\noverlooks important measurements outside the GT bboxes-such as\nsidelobes-leading to synthetic distributions that deviate from real-world 4D\nRadar data. To address this limitation, we propose 4D Radar Ground Truth\nAugmentation (4DR GT-Aug). Our approach first augments LiDAR data and then\nconverts it to 4D Radar data via a LiDAR-to-4D Radar data synthesis (L2RDaS)\nmodule, which explicitly accounts for measurements both inside and outside GT\nbboxes. In doing so, it produces 4D Radar data distributions that more closely\nresemble real-world measurements, thereby improving object detection accuracy.\nExperiments on the K-Radar dataset show that the proposed method achieves\nimproved performance compared to conventional GT-Aug in object detection for 4D\nRadar. 
The implementation code is available at\nhttps://github.com/kaist-avelab/K-Radar.\n","authors":["Woo-Jin Jung","Dong-Hee Paek","Seung-Hyun Kong"],"pdf_url":"https://arxiv.org/pdf/2503.03637v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2410.08642v2","updated":"2025-03-05T15:55:52Z","published":"2024-10-11T09:10:26Z","title":"More than Memes: A Multimodal Topic Modeling Approach to Conspiracy\n Theories on Telegram","summary":" To address the increasing prevalence of (audio-)visual data on social media,\nand to capture the evolving and dynamic nature of this communication,\nresearchers have begun to explore the potential of unsupervised approaches for\nanalyzing multimodal online content. However, existing research often neglects\nvisual content beyond memes, and in addition lacks methods to compare topic\nmodels across modalities. Our study addresses these gaps by applying multimodal\ntopic modeling for analyzing conspiracy theories in German-language Telegram\nchannels. We use BERTopic with CLIP for the analysis of textual and visual data\nin a corpus of ~40, 000 Telegram messages posted in October 2023 in 571\nGerman-language Telegram channels known for disseminating conspiracy theories.\nThrough this dataset, we provide insights into unimodal and multimodal topic\nmodels by analyzing symmetry and intersections of topics across modalities. We\ndemonstrate the variety of textual and visual content shared in the channels\ndiscovered through the topic modeling, and propose a conceptual framework for\nthe analysis of textual and visual discursive strategies in the communication\nof conspiracy theories. 
We apply the framework in a case study of the topic\ngroup Israel Gaza.\n","authors":["Elisabeth Steffen"],"pdf_url":"https://arxiv.org/pdf/2410.08642v2.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.17741v4","updated":"2025-03-05T15:55:51Z","published":"2024-12-23T17:44:05Z","title":"Reasoning to Attend: Try to Understand How Token Works","summary":" Current Large Multimodal Models (LMMs) empowered visual grounding typically\nrely on $\\texttt{}$ token as a text prompt to jointly optimize the\nvision-language model (e.g., LLaVA) and the downstream task-specified model\n(\\eg, SAM). However, we observe that little research has looked into how it\nworks. In this work, we first visualize the similarity maps, which are obtained\nby computing the semantic similarity between the $\\texttt{}$ token and the\nimage token embeddings derived from the last hidden layer in both the LLaVA\nencoder and SAM decoder. Intriguingly, we have found that a striking\nconsistency holds in terms of activation responses in the similarity map,which\nreveals that what $\\texttt{}$ token contributes to is the semantic\nsimilarity within image-text pairs. Specifically, $\\texttt{}$ token, a\nplaceholder expanded in text vocabulary, extensively queries among individual\ntokenized image patches to match the semantics of an object from text to the\npaired image while the Large Language Models (LLMs) are being fine-tuned. Upon\nthe above findings, we present READ, which facilitates LMMs' resilient\n$\\textbf{REA}$soning capability of where to atten$\\textbf{D}$ under the\nguidance of highly activated points borrowed from similarity maps. Remarkably,\nREAD features an intuitive design, Similarity as Points module (SasP), which\ncan be seamlessly applied to $\\texttt{}$-like paradigms in a plug-and-play\nfashion. Also, extensive experiments have been conducted on the ReasonSeg and\nRefCOCO(+/g) datasets. 
To validate whether READ suffers from catastrophic\nforgetting of previous skills after fine-tuning, we further assess its\ngeneration ability on an augmented FP-RefCOCO(+/g) dataset. All codes and\nmodels are publicly available at https://github.com/rui-qian/READ.\n","authors":["Rui Qian","Xin Yin","Dejing Dou"],"pdf_url":"https://arxiv.org/pdf/2412.17741v4.pdf","comment":"This work has been accepted to CVPR 2025, please refer to\n https://github.com/rui-qian/READ"},{"id":"http://arxiv.org/abs/2503.03613v1","updated":"2025-03-05T15:51:59Z","published":"2025-03-05T15:51:59Z","title":"CLIP is Strong Enough to Fight Back: Test-time Counterattacks towards\n Zero-shot Adversarial Robustness of CLIP","summary":" Despite its prevalent use in image-text matching tasks in a zero-shot manner,\nCLIP has been shown to be highly vulnerable to adversarial perturbations added\nonto images. Recent studies propose to finetune the vision encoder of CLIP with\nadversarial samples generated on the fly, and show improved robustness against\nadversarial attacks on a spectrum of downstream datasets, a property termed as\nzero-shot robustness. In this paper, we show that malicious perturbations that\nseek to maximise the classification loss lead to `falsely stable' images, and\npropose to leverage the pre-trained vision encoder of CLIP to counterattack\nsuch adversarial images during inference to achieve robustness. Our paradigm is\nsimple and training-free, providing the first method to defend CLIP from\nadversarial attacks at test time, which is orthogonal to existing methods\naiming to boost zero-shot adversarial robustness of CLIP. We conduct\nexperiments across 16 classification datasets, and demonstrate stable and\nconsistent gains compared to test-time defence methods adapted from existing\nadversarial robustness studies that do not rely on external networks, without\nnoticeably impairing performance on clean images. 
We also show that our\nparadigm can be employed on CLIP models that have been adversarially finetuned\nto further enhance their robustness at test time. Our code is available\n\\href{https://github.com/Sxing2/CLIP-Test-time-Counterattacks}{here}.\n","authors":["Songlong Xing","Zhengyu Zhao","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2503.03613v1.pdf","comment":"Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2411.05738v2","updated":"2025-03-05T15:51:07Z","published":"2024-11-08T17:54:18Z","title":"StdGEN: Semantic-Decomposed 3D Character Generation from Single Images","summary":" We present StdGEN, an innovative pipeline for generating semantically\ndecomposed high-quality 3D characters from single images, enabling broad\napplications in virtual reality, gaming, and filmmaking, etc. Unlike previous\nmethods which struggle with limited decomposability, unsatisfactory quality,\nand long optimization times, StdGEN features decomposability, effectiveness and\nefficiency; i.e., it generates intricately detailed 3D characters with\nseparated semantic components such as the body, clothes, and hair, in three\nminutes. At the core of StdGEN is our proposed Semantic-aware Large\nReconstruction Model (S-LRM), a transformer-based generalizable model that\njointly reconstructs geometry, color and semantics from multi-view images in a\nfeed-forward manner. A differentiable multi-layer semantic surface extraction\nscheme is introduced to acquire meshes from hybrid implicit fields\nreconstructed by our S-LRM. Additionally, a specialized efficient multi-view\ndiffusion model and an iterative multi-layer surface refinement module are\nintegrated into the pipeline to facilitate high-quality, decomposable 3D\ncharacter generation. Extensive experiments demonstrate our state-of-the-art\nperformance in 3D anime character generation, surpassing existing baselines by\na significant margin in geometry, texture and decomposability. 
StdGEN offers\nready-to-use semantic-decomposed 3D characters and enables flexible\ncustomization for a wide range of applications. Project page:\nhttps://stdgen.github.io\n","authors":["Yuze He","Yanning Zhou","Wang Zhao","Zhongkai Wu","Kaiwen Xiao","Wei Yang","Yong-Jin Liu","Xiao Han"],"pdf_url":"https://arxiv.org/pdf/2411.05738v2.pdf","comment":"CVPR 2025. 13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.07746v3","updated":"2025-03-05T15:35:06Z","published":"2024-03-12T15:28:51Z","title":"Unleashing HyDRa: Hybrid Fusion, Depth Consistency and Radar for Unified\n 3D Perception","summary":" Low-cost, vision-centric 3D perception systems for autonomous driving have\nmade significant progress in recent years, narrowing the gap to expensive\nLiDAR-based methods. The primary challenge in becoming a fully reliable\nalternative lies in robust depth prediction capabilities, as camera-based\nsystems struggle with long detection ranges and adverse lighting and weather\nconditions. In this work, we introduce HyDRa, a novel camera-radar fusion\narchitecture for diverse 3D perception tasks. Building upon the principles of\ndense BEV (Bird's Eye View)-based architectures, HyDRa introduces a hybrid\nfusion approach to combine the strengths of complementary camera and radar\nfeatures in two distinct representation spaces. Our Height Association\nTransformer module leverages radar features already in the perspective view to\nproduce more robust and accurate depth predictions. In the BEV, we refine the\ninitial sparse representation by a Radar-weighted Depth Consistency. HyDRa\nachieves a new state-of-the-art for camera-radar fusion of 64.2 NDS (+1.8) and\n58.4 AMOTA (+1.5) on the public nuScenes dataset. Moreover, our new\nsemantically rich and spatially accurate BEV features can be directly converted\ninto a powerful occupancy representation, beating all previous camera-based\nmethods on the Occ3D benchmark by an impressive 3.7 mIoU. 
Code and models are\navailable at https://github.com/phi-wol/hydra.\n","authors":["Philipp Wolters","Johannes Gilg","Torben Teepe","Fabian Herzog","Anouar Laouichi","Martin Hofmann","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2403.07746v3.pdf","comment":"10 pages, 7 figures, added eval on VoD, added appendix"},{"id":"http://arxiv.org/abs/2503.03599v1","updated":"2025-03-05T15:32:38Z","published":"2025-03-05T15:32:38Z","title":"REGRACE: A Robust and Efficient Graph-based Re-localization Algorithm\n using Consistency Evaluation","summary":" Loop closures are essential for correcting odometry drift and creating\nconsistent maps, especially in the context of large-scale navigation. Current\nmethods using dense point clouds for accurate place recognition do not scale\nwell due to computationally expensive scan-to-scan comparisons. Alternative\nobject-centric approaches are more efficient but often struggle with\nsensitivity to viewpoint variation. In this work, we introduce REGRACE, a novel\napproach that addresses these challenges of scalability and perspective\ndifference in re-localization by using LiDAR-based submaps. We introduce\nrotation-invariant features for each labeled object and enhance them with\nneighborhood context through a graph neural network. To identify potential\nrevisits, we employ a scalable bag-of-words approach, pooling one learned\nglobal feature per submap. Additionally, we define a revisit with geometrical\nconsistency cues rather than embedding distance, allowing us to recognize\nfar-away loop closures. Our evaluations demonstrate that REGRACE achieves\nsimilar results compared to state-of-the-art place recognition and registration\nbaselines while being twice as fast.\n","authors":["Débora N. P. 
Oliveira","Joshua Knights","Sebastián Barbas Laina","Simon Boche","Wolfram Burgard","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2503.03599v1.pdf","comment":"Submitted to IROS2025"},{"id":"http://arxiv.org/abs/2501.01999v2","updated":"2025-03-05T15:26:17Z","published":"2025-01-01T07:00:41Z","title":"On the Utility of Equivariance and Symmetry Breaking in Deep Learning\n Architectures on Point Clouds","summary":" This paper explores the key factors that influence the performance of models\nworking with point clouds, across different tasks of varying geometric\ncomplexity. In this work, we explore the trade-offs between flexibility and\nweight-sharing introduced by equivariant layers, assessing when equivariance\nboosts or detracts from performance. It is often argued that providing more\ninformation as input improves a model's performance. However, if this\nadditional information breaks certain properties, such as $\\SE(3)$\nequivariance, does it remain beneficial? We identify the key aspects of\nequivariant and non-equivariant architectures that drive success in different\ntasks by benchmarking them on segmentation, regression, and generation tasks\nacross multiple datasets with increasing complexity. We observe a positive\nimpact of equivariance, which becomes more pronounced with increasing task\ncomplexity, even when strict equivariance is not required.\n","authors":["Sharvaree Vadgama","Mohammad Mohaiminul Islam","Domas Buracus","Christian Shewmake","Erik Bekkers"],"pdf_url":"https://arxiv.org/pdf/2501.01999v2.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.05096v2","updated":"2025-03-05T15:26:13Z","published":"2024-10-07T14:50:56Z","title":"Human-in-the-loop Reasoning For Traffic Sign Detection: Collaborative\n Approach Yolo With Video-llava","summary":" Traffic Sign Recognition (TSR) detection is a crucial component of autonomous\nvehicles. 
While You Only Look Once (YOLO) is a popular real-time object\ndetection algorithm, factors like training data quality and adverse weather\nconditions (e.g., heavy rain) can lead to detection failures. These failures\ncan be particularly dangerous when visual similarities between objects exist,\nsuch as mistaking a 30 km/h sign for a higher speed limit sign. This paper\nproposes a method that combines video analysis and reasoning, prompting with a\nhuman-in-the-loop guide large vision model to improve YOLOs accuracy in\ndetecting road speed limit signs, especially in semi-real-world conditions. It\nis hypothesized that the guided prompting and reasoning abilities of\nVideo-LLava can enhance YOLOs traffic sign detection capabilities. This\nhypothesis is supported by an evaluation based on human-annotated accuracy\nmetrics within a dataset of recorded videos from the CARLA car simulator. The\nresults demonstrate that a collaborative approach combining YOLO with\nVideo-LLava and reasoning can effectively address challenging situations such\nas heavy rain and overcast conditions that hinder YOLOs detection capabilities.\n","authors":["Mehdi Azarafza","Fatima Idrees","Ali Ehteshami Bejnordi","Charles Steinmetz","Stefan Henkler","Achim Rettberg"],"pdf_url":"https://arxiv.org/pdf/2410.05096v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.16215v2","updated":"2025-03-05T14:49:21Z","published":"2024-09-24T16:21:27Z","title":"Tiny Robotics Dataset and Benchmark for Continual Object Detection","summary":" Detecting objects in mobile robotics is crucial for numerous applications,\nfrom autonomous navigation to inspection. However, robots often need to operate\nin different domains from those they were trained in, requiring them to adjust\nto these changes. Tiny mobile robots, subject to size, power, and computational\nconstraints, encounter even more difficulties in running and adapting these\nalgorithms. 
Such adaptability, though, is crucial for real-world deployment,\nwhere robots must operate effectively in dynamic and unpredictable settings. In\nthis work, we introduce a novel benchmark to evaluate the continual learning\ncapabilities of object detection systems in tiny robotic platforms. Our\ncontributions include: (i) Tiny Robotics Object Detection~(TiROD), a\ncomprehensive dataset collected using the onboard camera of a small mobile\nrobot, designed to test object detectors across various domains and classes;\n(ii) a benchmark of different continual learning strategies on this dataset\nusing NanoDet, a lightweight object detector. Our results highlight key\nchallenges in developing robust and efficient continual learning strategies for\nobject detectors in tiny robotics.\n","authors":["Francesco Pasti","Riccardo De Monte","Davide Dalle Pezze","Gian Antonio Susto","Nicola Bellotto"],"pdf_url":"https://arxiv.org/pdf/2409.16215v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03562v1","updated":"2025-03-05T14:49:08Z","published":"2025-03-05T14:49:08Z","title":"Towards Visual Discrimination and Reasoning of Real-World Physical\n Dynamics: Physics-Grounded Anomaly Detection","summary":" Humans detect real-world object anomalies by perceiving, interacting, and\nreasoning based on object-conditioned physical knowledge. The long-term goal of\nIndustrial Anomaly Detection (IAD) is to enable machines to autonomously\nreplicate this skill. However, current IAD algorithms are largely developed and\ntested on static, semantically simple datasets, which diverge from real-world\nscenarios where physical understanding and reasoning are essential.To bridge\nthis gap, we introduce the Physics Anomaly Detection (Phys-AD) dataset, the\nfirst large-scale, real-world, physics-grounded video dataset for industrial\nanomaly detection. Collected using a real robot arm and motor, Phys-AD provides\na diverse set of dynamic, semantically rich scenarios. 
The dataset includes\nmore than 6400 videos across 22 real-world object categories, interacting with\nrobot arms and motors, and exhibits 47 types of anomalies. Anomaly detection in\nPhys-AD requires visual reasoning, combining both physical knowledge and video\ncontent to determine object abnormality.We benchmark state-of-the-art anomaly\ndetection methods under three settings: unsupervised AD, weakly-supervised AD,\nand video-understanding AD, highlighting their limitations in handling\nphysics-grounded anomalies. Additionally, we introduce the Physics Anomaly\nExplanation (PAEval) metric, designed to assess the ability of visual-language\nfoundation models to not only detect anomalies but also provide accurate\nexplanations for their underlying physical causes. Our dataset and benchmark\nwill be publicly available.\n","authors":["Wenqiao Li","Yao Gu","Xintao Chen","Xiaohao Xu","Ming Hu","Xiaonan Huang","Yingna Wu"],"pdf_url":"https://arxiv.org/pdf/2503.03562v1.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2411.13982v2","updated":"2025-03-05T14:45:55Z","published":"2024-11-21T09:47:13Z","title":"Safety Without Semantic Disruptions: Editing-free Safe Image Generation\n via Context-preserving Dual Latent Reconstruction","summary":" Training multimodal generative models on large, uncurated datasets can result\nin users being exposed to harmful, unsafe and controversial or\nculturally-inappropriate outputs. While model editing has been proposed to\nremove or filter undesirable concepts in embedding and latent spaces, it can\ninadvertently damage learned manifolds, distorting concepts in close semantic\nproximity. We identify limitations in current model editing techniques, showing\nthat even benign, proximal concepts may become misaligned. To address the need\nfor safe content generation, we leverage safe embeddings and a modified\ndiffusion process with tunable weighted summation in the latent space to\ngenerate safer images. 
Our method preserves global context without compromising\nthe structural integrity of the learned manifolds. We achieve state-of-the-art\nresults on safe image generation benchmarks and offer intuitive control over\nthe level of model safety. We identify trade-offs between safety and\ncensorship, which presents a necessary perspective in the development of\nethical AI models. We will release our code.\n Keywords: Text-to-Image Models, Generative AI, Safety, Reliability, Model\nEditing\n","authors":["Jordan Vice","Naveed Akhtar","Mubarak Shah","Richard Hartley","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2411.13982v2.pdf","comment":"This research is supported by the NISDRG project #20100007, funded by\n the Australian Government"},{"id":"http://arxiv.org/abs/2503.03558v1","updated":"2025-03-05T14:45:32Z","published":"2025-03-05T14:45:32Z","title":"High-Quality Virtual Single-Viewpoint Surgical Video: Geometric\n Autocalibration of Multiple Cameras in Surgical Lights","summary":" Occlusion-free video generation is challenging due to surgeons' obstructions\nin the camera field of view. Prior work has addressed this issue by installing\nmultiple cameras on a surgical light, hoping some cameras will observe the\nsurgical field with less occlusion. However, this special camera setup poses a\nnew imaging challenge since camera configurations can change every time\nsurgeons move the light, and manual image alignment is required. This paper\nproposes an algorithm to automate this alignment task. The proposed method\ndetects frames where the lighting system moves, realigns them, and selects the\ncamera with the least occlusion. This algorithm results in a stabilized video\nwith less occlusion. Quantitative results show that our method outperforms\nconventional approaches. 
A user study involving medical doctors also confirmed\nthe superiority of our method.\n","authors":["Yuna Kato","Mariko Isogawa","Shohei Mori","Hideo Saito","Hiroki Kajita","Yoshifumi Takatsume"],"pdf_url":"https://arxiv.org/pdf/2503.03558v1.pdf","comment":"Accepted at MICCAI2023"},{"id":"http://arxiv.org/abs/2503.03556v1","updated":"2025-03-05T14:44:53Z","published":"2025-03-05T14:44:53Z","title":"Afford-X: Generalizable and Slim Affordance Reasoning for Task-oriented\n Manipulation","summary":" Object affordance reasoning, the ability to infer object functionalities\nbased on physical properties, is fundamental for task-oriented planning and\nactivities in both humans and Artificial Intelligence (AI). This capability,\nrequired for planning and executing daily activities in a task-oriented manner,\nrelies on commonsense knowledge of object physics and functionalities,\nextending beyond simple object recognition. Current computational models for\naffordance reasoning from perception lack generalizability, limiting their\napplicability in novel scenarios. Meanwhile, comprehensive Large Language\nModels (LLMs) with emerging reasoning capabilities are challenging to deploy on\nlocal devices for task-oriented manipulations. Here, we introduce LVIS-Aff, a\nlarge-scale dataset comprising 1,496 tasks and 119k images, designed to enhance\nthe generalizability of affordance reasoning from perception. Utilizing this\ndataset, we develop Afford-X, an end-to-end trainable affordance reasoning\nmodel that incorporates Verb Attention and Bi-Fusion modules to improve\nmulti-modal understanding. This model achieves up to a 12.1% performance\nimprovement over the best-reported results from non-LLM methods, while also\ndemonstrating a 1.2% enhancement compared to our previous conference paper.\nAdditionally, it maintains a compact 187M parameter size and infers nearly 50\ntimes faster than the GPT-4V API. 
Our work demonstrates the potential for\nefficient, generalizable affordance reasoning models that can be deployed on\nlocal devices for task-oriented manipulations. We showcase Afford-X's\neffectiveness in enabling task-oriented manipulations for robots across various\ntasks and environments, underscoring its efficiency and broad implications for\nadvancing robotics and AI systems in real-world applications.\n","authors":["Xiaomeng Zhu","Yuyang Li","Leiyao Cui","Pengfei Li","Huan-ang Gao","Yixin Zhu","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2503.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09781v2","updated":"2025-03-05T14:44:18Z","published":"2025-01-16T18:59:10Z","title":"VideoWorld: Exploring Knowledge Learning from Unlabeled Videos","summary":" This work explores whether a deep generative model can learn complex\nknowledge solely from visual input, in contrast to the prevalent focus on\ntext-based models like large language models (LLMs). We develop VideoWorld, an\nauto-regressive video generation model trained on unlabeled video data, and\ntest its knowledge acquisition abilities in video-based Go and robotic control\ntasks. Our experiments reveal two key findings: (1) video-only training\nprovides sufficient information for learning knowledge, including rules,\nreasoning and planning capabilities, and (2) the representation of visual\nchange is crucial for knowledge acquisition. To improve both the efficiency and\nefficacy of this process, we introduce the Latent Dynamics Model (LDM) as a key\ncomponent of VideoWorld. Remarkably, VideoWorld reaches a 5-dan professional\nlevel in the Video-GoBench with just a 300-million-parameter model, without\nrelying on search algorithms or reward mechanisms typical in reinforcement\nlearning. In robotic tasks, VideoWorld effectively learns diverse control\noperations and generalizes across environments, approaching the performance of\noracle models in CALVIN and RLBench. 
This study opens new avenues for knowledge\nacquisition from visual data, with all code, data, and models open-sourced for\nfurther research.\n","authors":["Zhongwei Ren","Yunchao Wei","Xun Guo","Yao Zhao","Bingyi Kang","Jiashi Feng","Xiaojie Jin"],"pdf_url":"https://arxiv.org/pdf/2501.09781v2.pdf","comment":"Code and models are released at:\n https://maverickren.github.io/VideoWorld.github.io/"},{"id":"http://arxiv.org/abs/2210.09604v3","updated":"2025-03-05T14:43:59Z","published":"2022-10-18T05:34:58Z","title":"Perceptual Multi-Exposure Fusion","summary":" As an ever-increasing demand for high dynamic range (HDR) scene shooting,\nmulti-exposure image fusion (MEF) technology has abounded. In recent years,\nmulti-scale exposure fusion approaches based on detail-enhancement have led the\nway for improvement in highlight and shadow details. Most of such methods,\nhowever, are too computationally expensive to be deployed on mobile devices.\nThis paper presents a perceptual multi-exposure fusion method that not just\nensures fine shadow/highlight details but with lower complexity than\ndetailenhanced methods. We analyze the potential defects of three classical\nexposure measures in lieu of using detail-enhancement component and improve two\nof them, namely adaptive Wellexposedness (AWE) and the gradient of color images\n(3-D gradient). AWE designed in YCbCr color space considers the difference\nbetween varying exposure images. 3-D gradient is employed to extract fine\ndetails. We build a large-scale multiexposure benchmark dataset suitable for\nstatic scenes, which contains 167 image sequences all told. Experiments on the\nconstructed dataset demonstrate that the proposed method exceeds existing eight\nstate-of-the-art approaches in terms of visually and MEF-SSIM value. 
Moreover,\nour approach can achieve a better improvement for current image enhancement\ntechniques, ensuring fine detail in bright light.\n","authors":["Xiaoning Liu"],"pdf_url":"https://arxiv.org/pdf/2210.09604v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07260v2","updated":"2025-03-05T14:40:41Z","published":"2024-12-10T07:42:02Z","title":"DFREC: DeepFake Identity Recovery Based on Identity-aware Masked\n Autoencoder","summary":" Recent advances in deepfake forensics have primarily focused on improving the\nclassification accuracy and generalization performance. Despite enormous\nprogress in detection accuracy across a wide variety of forgery algorithms,\nexisting algorithms lack intuitive interpretability and identity traceability\nto help with forensic investigation. In this paper, we introduce a novel\nDeepFake Identity Recovery scheme (DFREC) to fill this gap. DFREC aims to\nrecover the pair of source and target faces from a deepfake image to facilitate\ndeepfake identity tracing and reduce the risk of deepfake attack. It comprises\nthree key components: an Identity Segmentation Module (ISM), a Source Identity\nReconstruction Module (SIRM), and a Target Identity Reconstruction Module\n(TIRM). The ISM segments the input face into distinct source and target face\ninformation, and the SIRM reconstructs the source face and extracts latent\ntarget identity features with the segmented source information. The background\ncontext and latent target identity features are synergetically fused by a\nMasked Autoencoder in the TIRM to reconstruct the target face. We evaluate\nDFREC on six different high-fidelity face-swapping attacks on FaceForensics++,\nCelebaMegaFS and FFHQ-E4S datasets, which demonstrate its superior recovery\nperformance over state-of-the-art deepfake recovery algorithms. 
In addition,\nDFREC is the only scheme that can recover both pristine source and target faces\ndirectly from the forgery image with high fadelity.\n","authors":["Peipeng Yu","Hui Gao","Jianwei Fei","Zhitao Huang","Zhihua Xia","Chip-Hong Chang"],"pdf_url":"https://arxiv.org/pdf/2412.07260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03548v1","updated":"2025-03-05T14:32:32Z","published":"2025-03-05T14:32:32Z","title":"Simulation-Based Performance Evaluation of 3D Object Detection Methods\n with Deep Learning for a LiDAR Point Cloud Dataset in a SOTIF-related Use\n Case","summary":" Safety of the Intended Functionality (SOTIF) addresses sensor performance\nlimitations and deep learning-based object detection insufficiencies to ensure\nthe intended functionality of Automated Driving Systems (ADS). This paper\npresents a methodology examining the adaptability and performance evaluation of\nthe 3D object detection methods on a LiDAR point cloud dataset generated by\nsimulating a SOTIF-related Use Case. The major contributions of this paper\ninclude defining and modelling a SOTIF-related Use Case with 21 diverse weather\nconditions and generating a LiDAR point cloud dataset suitable for application\nof 3D object detection methods. The dataset consists of 547 frames,\nencompassing clear, cloudy, rainy weather conditions, corresponding to\ndifferent times of the day, including noon, sunset, and night. 
Employing\nMMDetection3D and OpenPCDET toolkits, the performance of State-of-the-Art\n(SOTA) 3D object detection methods is evaluated and compared by testing the\npre-trained Deep Learning (DL) models on the generated dataset using Average\nPrecision (AP) and Recall metrics.\n","authors":["Milin Patel","Rolf Jung"],"pdf_url":"https://arxiv.org/pdf/2503.03548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13335v2","updated":"2025-03-05T14:32:31Z","published":"2025-01-23T02:31:57Z","title":"Deblur-Avatar: Animatable Avatars from Motion-Blurred Monocular Videos","summary":" We introduce a novel framework for modeling high-fidelity, animatable 3D\nhuman avatars from motion-blurred monocular video inputs. Motion blur is\nprevalent in real-world dynamic video capture, especially due to human\nmovements in 3D human avatar modeling. Existing methods either (1) assume sharp\nimage inputs, failing to address the detail loss introduced by motion blur, or\n(2) mainly consider blur by camera movements, neglecting the human motion blur\nwhich is more common in animatable avatars. Our proposed approach integrates a\nhuman movement-based motion blur model into 3D Gaussian Splatting (3DGS). By\nexplicitly modeling human motion trajectories during exposure time, we jointly\noptimize the trajectories and 3D Gaussians to reconstruct sharp, high-quality\nhuman avatars. 
We employ a pose-dependent fusion mechanism to distinguish\nmoving body regions, optimizing both blurred and sharp areas effectively.\nExtensive experiments on synthetic and real-world datasets demonstrate that our\nmethod significantly outperforms existing methods in rendering quality and\nquantitative metrics, producing sharp avatar reconstructions and enabling\nreal-time rendering under challenging motion blur conditions.\n","authors":["Xianrui Luo","Juewen Peng","Zhongang Cai","Lei Yang","Fan Yang","Zhiguo Cao","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2501.13335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03543v1","updated":"2025-03-05T14:28:01Z","published":"2025-03-05T14:28:01Z","title":"A self-supervised cyclic neural-analytic approach for novel view\n synthesis and 3D reconstruction","summary":" Generating novel views from recorded videos is crucial for enabling\nautonomous UAV navigation. Recent advancements in neural rendering have\nfacilitated the rapid development of methods capable of rendering new\ntrajectories. However, these methods often fail to generalize well to regions\nfar from the training data without an optimized flight path, leading to\nsuboptimal reconstructions. We propose a self-supervised cyclic neural-analytic\npipeline that combines high-quality neural rendering outputs with precise\ngeometric insights from analytical methods. Our solution improves RGB and mesh\nreconstructions for novel view synthesis, especially in undersampled areas and\nregions that are completely different from the training dataset. We use an\neffective transformer-based architecture for image reconstruction to refine and\nadapt the synthesis process, enabling effective handling of novel, unseen poses\nwithout relying on extensive labeled datasets. 
Our findings demonstrate\nsubstantial improvements in rendering views of novel and also 3D\nreconstruction, which to the best of our knowledge is a first, setting a new\nstandard for autonomous navigation in complex outdoor environments.\n","authors":["Dragos Costea","Alina Marcu","Marius Leordeanu"],"pdf_url":"https://arxiv.org/pdf/2503.03543v1.pdf","comment":"Published in BMVC 2024, 10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2503.02394v2","updated":"2025-03-05T14:25:37Z","published":"2025-03-04T08:35:01Z","title":"BHViT: Binarized Hybrid Vision Transformer","summary":" Model binarization has made significant progress in enabling real-time and\nenergy-efficient computation for convolutional neural networks (CNN), offering\na potential solution to the deployment challenges faced by Vision Transformers\n(ViTs) on edge devices. However, due to the structural differences between CNN\nand Transformer architectures, simply applying binary CNN strategies to the ViT\nmodels will lead to a significant performance drop. To tackle this challenge,\nwe propose BHViT, a binarization-friendly hybrid ViT architecture and its full\nbinarization model with the guidance of three important observations.\nInitially, BHViT utilizes the local information interaction and hierarchical\nfeature aggregation technique from coarse to fine levels to address redundant\ncomputations stemming from excessive tokens. Then, a novel module based on\nshift operations is proposed to enhance the performance of the binary\nMultilayer Perceptron (MLP) module without significantly increasing\ncomputational overhead. In addition, an innovative attention matrix\nbinarization method based on quantization decomposition is proposed to evaluate\nthe token's importance in the binarized attention matrix. Finally, we propose a\nregularization loss to address the inadequate optimization caused by the\nincompatibility between the weight oscillation in the binary layers and the\nAdam Optimizer. 
Extensive experimental results demonstrate that our proposed\nalgorithm achieves SOTA performance among binary ViT methods.\n","authors":["Tian Gao","Zhiyuan Zhang","Yu Zhang","Huajun Liu","Kaijie Yin","Chengzhong Xu","Hui Kong"],"pdf_url":"https://arxiv.org/pdf/2503.02394v2.pdf","comment":"Accepted by CVPR2025"},{"id":"http://arxiv.org/abs/2503.03535v1","updated":"2025-03-05T14:18:39Z","published":"2025-03-05T14:18:39Z","title":"Unified Human Localization and Trajectory Prediction with Monocular\n Vision","summary":" Conventional human trajectory prediction models rely on clean curated data,\nrequiring specialized equipment or manual labeling, which is often impractical\nfor robotic applications. The existing predictors tend to overfit to clean\nobservation affecting their robustness when used with noisy inputs. In this\nwork, we propose MonoTransmotion (MT), a Transformer-based framework that uses\nonly a monocular camera to jointly solve localization and prediction tasks. Our\nframework has two main modules: Bird's Eye View (BEV) localization and\ntrajectory prediction. The BEV localization module estimates the position of a\nperson using 2D human poses, enhanced by a novel directional loss for smoother\nsequential localizations. The trajectory prediction module predicts future\nmotion from these estimates. We show that by jointly training both tasks with\nour unified framework, our method is more robust in real-world scenarios made\nof noisy inputs. We validate our MT network on both curated and non-curated\ndatasets. On the curated dataset, MT achieves around 12% improvement over\nbaseline models on BEV localization and trajectory prediction. 
On real-world\nnon-curated dataset, experimental results indicate that MT maintains similar\nperformance levels, highlighting its robustness and generalization capability.\nThe code is available at https://github.com/vita-epfl/MonoTransmotion.\n","authors":["Po-Chien Luan","Yang Gao","Celine Demonsant","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2503.03535v1.pdf","comment":"ICRA 2025"},{"id":"http://arxiv.org/abs/2411.02951v2","updated":"2025-03-05T14:16:27Z","published":"2024-11-05T09:51:59Z","title":"LDPM: Towards undersampled MRI reconstruction with MR-VAE and Latent\n Diffusion Prior","summary":" Diffusion models, as powerful generative models, have found a wide range of\napplications and shown great potential in solving image reconstruction\nproblems. Some works attempted to solve MRI reconstruction with diffusion\nmodels, but these methods operate directly in pixel space, leading to higher\ncomputational costs for optimization and inference. Latent diffusion models,\npre-trained on natural images with rich visual priors, are expected to solve\nthe high computational cost problem in MRI reconstruction by operating in a\nlower-dimensional latent space. However, direct application to MRI\nreconstruction faces three key challenges: (1) absence of explicit control\nmechanisms for medical fidelity, (2) domain gap between natural images and MR\nphysics, and (3) undefined data consistency in latent space. To address these\nchallenges, a novel Latent Diffusion Prior-based undersampled MRI\nreconstruction (LDPM) method is proposed. 
Our LDPM framework addresses these\nchallenges by: (1) a sketch-guided pipeline with a two-step reconstruction\nstrategy, which balances perceptual quality and anatomical fidelity, (2) an\nMRI-optimized VAE (MR-VAE), which achieves an improvement of approximately 3.92\ndB in PSNR for undersampled MRI reconstruction compared to that with SD-VAE\n\\cite{sd}, and (3) Dual-Stage Sampler, a modified version of spaced DDPM\nsampler, which enforces high-fidelity reconstruction in the latent space.\nExperiments on the fastMRI dataset\\cite{fastmri} demonstrate the\nstate-of-the-art performance of the proposed method and its robustness across\nvarious scenarios. The effectiveness of each module is also verified through\nablation experiments.\n","authors":["Xingjian Tang","Jingwei Guan","Linge Li","Ran Shi","Youmei Zhang","Mengye Lyu","Li Yan"],"pdf_url":"https://arxiv.org/pdf/2411.02951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16502v2","updated":"2025-03-05T14:11:44Z","published":"2024-09-24T23:18:32Z","title":"GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for\n Improved Visual Localization","summary":" Although various visual localization approaches exist, such as scene\ncoordinate regression and camera pose regression, these methods often struggle\nwith optimization complexity or limited accuracy. To address these challenges,\nwe explore the use of novel view synthesis techniques, particularly 3D Gaussian\nSplatting (3DGS), which enables the compact encoding of both 3D geometry and\nscene appearance. We propose a two-stage procedure that integrates dense and\nrobust keypoint descriptors from the lightweight XFeat feature extractor into\n3DGS, enhancing performance in both indoor and outdoor environments. The coarse\npose estimates are directly obtained via 2D-3D correspondences between the 3DGS\nrepresentation and query image descriptors. 
In the second stage, the initial\npose estimate is refined by minimizing the rendering-based photometric warp\nloss. Benchmarking on widely used indoor and outdoor datasets demonstrates\nimprovements over recent neural rendering-based localization methods, such as\nNeRFMatch and PNeRFLoc.\n","authors":["Gennady Sidorov","Malik Mohrat","Denis Gridusov","Ruslan Rakhimov","Sergey Kolyubin"],"pdf_url":"https://arxiv.org/pdf/2409.16502v2.pdf","comment":"Project website at https://gsplatloc.github.io/"},{"id":"http://arxiv.org/abs/2503.03528v1","updated":"2025-03-05T14:11:13Z","published":"2025-03-05T14:11:13Z","title":"AdaSin: Enhancing Hard Sample Metrics with Dual Adaptive Penalty for\n Face Recognition","summary":" In recent years, the emergence of deep convolutional neural networks has\npositioned face recognition as a prominent research focus in computer vision.\nTraditional loss functions, such as margin-based, hard-sample mining-based, and\nhybrid approaches, have achieved notable performance improvements, with some\nleveraging curriculum learning to optimize training. However, these methods\noften fall short in effectively quantifying the difficulty of hard samples. To\naddress this, we propose Adaptive Sine (AdaSin) loss function, which introduces\nthe sine of the angle between a sample's embedding feature and its ground-truth\nclass center as a novel difficulty metric. This metric enables precise and\neffective penalization of hard samples. By incorporating curriculum learning,\nthe model dynamically adjusts classification boundaries across different\ntraining stages. Unlike previous adaptive-margin loss functions, AdaSin\nintroduce a dual adaptive penalty, applied to both the positive and negative\ncosine similarities of hard samples. This design imposes stronger constraints,\nenhancing intra-class compactness and inter-class separability. The combination\nof the dual adaptive penalty and curriculum learning is guided by a\nwell-designed difficulty metric. 
It enables the model to focus more effectively\non hard samples in later training stages, and lead to the extraction of highly\ndiscriminative face features. Extensive experiments across eight benchmarks\ndemonstrate that AdaSin achieves superior accuracy compared to other\nstate-of-the-art methods.\n","authors":["Qiqi Guo","Zhuowen Zheng","Guanghua Yang","Zhiquan Liu","Xiaofan Li","Jianqing Li","Jinyu Tian","Xueyuan Gong"],"pdf_url":"https://arxiv.org/pdf/2503.03528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18783v2","updated":"2025-03-05T14:11:02Z","published":"2024-12-25T05:19:52Z","title":"ArtNVG: Content-Style Separated Artistic Neighboring-View Gaussian\n Stylization","summary":" As demand from the film and gaming industries for 3D scenes with target\nstyles grows, the importance of advanced 3D stylization techniques increases.\nHowever, recent methods often struggle to maintain local consistency in color\nand texture throughout stylized scenes, which is essential for maintaining\naesthetic coherence. To solve this problem, this paper introduces ArtNVG, an\ninnovative 3D stylization framework that efficiently generates stylized 3D\nscenes by leveraging reference style images. Built on 3D Gaussian Splatting\n(3DGS), ArtNVG achieves rapid optimization and rendering while upholding high\nreconstruction quality. Our framework realizes high-quality 3D stylization by\nincorporating two pivotal techniques: Content-Style Separated Control and\nAttention-based Neighboring-View Alignment. Content-Style Separated Control\nuses the CSGO model and the Tile ControlNet to decouple the content and style\ncontrol, reducing risks of information leakage. Concurrently, Attention-based\nNeighboring-View Alignment ensures consistency of local colors and textures\nacross neighboring views, significantly improving visual quality. 
Extensive\nexperiments validate that ArtNVG surpasses existing methods, delivering\nsuperior results in content preservation, style alignment, and local\nconsistency.\n","authors":["Zixiao Gu","Mengtian Li","Ruhua Chen","Zhongxia Ji","Sichen Guo","Zhenye Zhang","Guangnan Ye","Zuo Hu"],"pdf_url":"https://arxiv.org/pdf/2412.18783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03519v1","updated":"2025-03-05T14:03:34Z","published":"2025-03-05T14:03:34Z","title":"Do ImageNet-trained models learn shortcuts? The impact of frequency\n shortcuts on generalization","summary":" Frequency shortcuts refer to specific frequency patterns that models heavily\nrely on for correct classification. Previous studies have shown that models\ntrained on small image datasets often exploit such shortcuts, potentially\nimpairing their generalization performance. However, existing methods for\nidentifying frequency shortcuts require expensive computations and become\nimpractical for analyzing models trained on large datasets. In this work, we\npropose the first approach to more efficiently analyze frequency shortcuts at a\nlarger scale. We show that both CNN and transformer models learn frequency\nshortcuts on ImageNet. We also expose that frequency shortcut solutions can\nyield good performance on out-of-distribution (OOD) test sets which largely\nretain texture information. However, these shortcuts, mostly aligned with\ntexture patterns, hinder model generalization on rendition-based OOD test sets.\nThese observations suggest that current OOD evaluations often overlook the\nimpact of frequency shortcuts on model generalization. 
Future benchmarks could\nthus benefit from explicitly assessing and accounting for these shortcuts to\nbuild models that generalize across a broader range of OOD scenarios.\n","authors":["Shunxin Wang","Raymond Veldhuis","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2503.03519v1.pdf","comment":"received at CVPR2025"},{"id":"http://arxiv.org/abs/2402.09444v3","updated":"2025-03-05T14:02:10Z","published":"2024-01-31T15:37:12Z","title":"Multimodal Action Quality Assessment","summary":" Action quality assessment (AQA) is to assess how well an action is performed.\nPrevious works perform modelling by only the use of visual information,\nignoring audio information. We argue that although AQA is highly dependent on\nvisual information, the audio is useful complementary information for improving\nthe score regression accuracy, especially for sports with background music,\nsuch as figure skating and rhythmic gymnastics. To leverage multimodal\ninformation for AQA, i.e., RGB, optical flow and audio information, we propose\na Progressive Adaptive Multimodal Fusion Network (PAMFN) that separately models\nmodality-specific information and mixed-modality information. Our model\nconsists of with three modality-specific branches that independently explore\nmodality-specific information and a mixed-modality branch that progressively\naggregates the modality-specific information from the modality-specific\nbranches. To build the bridge between modality-specific branches and the\nmixed-modality branch, three novel modules are proposed. First, a\nModality-specific Feature Decoder module is designed to selectively transfer\nmodality-specific information to the mixed-modality branch. Second, when\nexploring the interaction between modality-specific information, we argue that\nusing an invariant multimodal fusion policy may lead to suboptimal results, so\nas to take the potential diversity in different parts of an action into\nconsideration. 
Therefore, an Adaptive Fusion Module is proposed to learn\nadaptive multimodal fusion policies in different parts of an action. This\nmodule consists of several FusionNets for exploring different multimodal fusion\nstrategies and a PolicyNet for deciding which FusionNets are enabled. Third, a\nmodule called Cross-modal Feature Decoder is designed to transfer cross-modal\nfeatures generated by Adaptive Fusion Module to the mixed-modality branch.\n","authors":["Ling-An Zeng","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.09444v3.pdf","comment":"IEEE Transactions on Image Processing 2024"},{"id":"http://arxiv.org/abs/2503.03507v1","updated":"2025-03-05T13:55:26Z","published":"2025-03-05T13:55:26Z","title":"Mineral segmentation using electron microscope images and spectral\n sampling through multimodal graph neural networks","summary":" We propose a novel Graph Neural Network-based method for segmentation based\non data fusion of multimodal Scanning Electron Microscope (SEM) images. In most\ncases, Backscattered Electron (BSE) images obtained using SEM do not contain\nsufficient information for mineral segmentation. Therefore, imaging is often\ncomplemented with point-wise Energy-Dispersive X-ray Spectroscopy (EDS)\nspectral measurements that provide highly accurate information about the\nchemical composition but that are time-consuming to acquire. This motivates the\nuse of sparse spectral data in conjunction with BSE images for mineral\nsegmentation. The unstructured nature of the spectral data makes most\ntraditional image fusion techniques unsuitable for BSE-EDS fusion. We propose\nusing graph neural networks to fuse the two modalities and segment the mineral\nphases simultaneously. Our results demonstrate that providing EDS data for as\nfew as 1% of BSE pixels produces accurate segmentation, enabling rapid analysis\nof mineral samples. 
The proposed data fusion pipeline is versatile and can be\nadapted to other domains that involve image data and point-wise measurements.\n","authors":["Samuel Repka","Bořek Reich","Fedor Zolotarev","Tuomas Eerola","Pavel Zemčík"],"pdf_url":"https://arxiv.org/pdf/2503.03507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03501v1","updated":"2025-03-05T13:47:02Z","published":"2025-03-05T13:47:02Z","title":"CarGait: Cross-Attention based Re-ranking for Gait recognition","summary":" Gait recognition is a computer vision task that identifies individuals based\non their walking patterns. Gait recognition performance is commonly evaluated\nby ranking a gallery of candidates and measuring the accuracy at the top\nRank-$K$. Existing models are typically single-staged, i.e. searching for the\nprobe's nearest neighbors in a gallery using a single global feature\nrepresentation. Although these models typically excel at retrieving the correct\nidentity within the top-$K$ predictions, they struggle when hard negatives\nappear in the top short-list, leading to relatively low performance at the\nhighest ranks (e.g., Rank-1). In this paper, we introduce CarGait, a\nCross-Attention Re-ranking method for gait recognition, that involves\nre-ordering the top-$K$ list leveraging the fine-grained correlations between\npairs of gait sequences through cross-attention between gait strips. This\nre-ranking scheme can be adapted to existing single-stage models to enhance\ntheir final results. 
We demonstrate the capabilities of CarGait by extensive\nexperiments on three common gait datasets, Gait3D, GREW, and OU-MVLP, and seven\ndifferent gait models, showing consistent improvements in Rank-1,5 accuracy,\nsuperior results over existing re-ranking methods, and strong baselines.\n","authors":["Gavriel Habib","Noa Barzilay","Or Shimshi","Rami Ben-Ari","Nir Darshan"],"pdf_url":"https://arxiv.org/pdf/2503.03501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02077v4","updated":"2025-03-05T13:43:07Z","published":"2024-05-03T13:10:16Z","title":"MVP-Shot: Multi-Velocity Progressive-Alignment Framework for Few-Shot\n Action Recognition","summary":" Recent few-shot action recognition (FSAR) methods typically perform semantic\nmatching on learned discriminative features to achieve promising performance.\nHowever, most FSAR methods focus on single-scale (e.g., frame-level,\nsegment-level, etc) feature alignment, which ignores that human actions with\nthe same semantic may appear at different velocities. To this end, we develop a\nnovel Multi-Velocity Progressive-alignment (MVP-Shot) framework to\nprogressively learn and align semantic-related action features at\nmulti-velocity levels. Concretely, a Multi-Velocity Feature Alignment (MVFA)\nmodule is designed to measure the similarity between features from support and\nquery videos with different velocity scales and then merge all similarity\nscores in a residual fashion. To avoid the multiple velocity features deviating\nfrom the underlying motion semantic, our proposed Progressive Semantic-Tailored\nInteraction (PSTI) module injects velocity-tailored text information into the\nvideo feature via feature interaction on channel and temporal domains at\ndifferent velocities. 
The above two modules compensate for each other to make\nmore accurate query sample predictions under the few-shot settings.\nExperimental results show our method outperforms current state-of-the-art\nmethods on multiple standard few-shot benchmarks (i.e., HMDB51, UCF101,\nKinetics, and SSv2-small).\n","authors":["Hongyu Qu","Rui Yan","Xiangbo Shu","Hailiang Gao","Peng Huang","Guo-Sen Xie"],"pdf_url":"https://arxiv.org/pdf/2405.02077v4.pdf","comment":"Accepted to TMM 2025"},{"id":"http://arxiv.org/abs/2402.12185v5","updated":"2025-03-05T13:41:21Z","published":"2024-02-19T14:48:23Z","title":"ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for\n Complicated Chart Reasoning","summary":" Recently, many versatile Multi-modal Large Language Models (MLLMs) have\nemerged continuously. However, their capacity to query information depicted in\nvisual charts and engage in reasoning based on the queried contents remains\nunder-explored. In this paper, to comprehensively and rigorously benchmark the\nability of the off-the-shelf MLLMs in the chart domain, we construct ChartX, a\nmulti-modal evaluation set covering 18 chart types, 7 chart tasks, 22\ndisciplinary topics, and high-quality chart data. Besides, we develop ChartVLM\nto offer a new perspective on handling multi-modal tasks that strongly depend\non interpretable patterns, such as reasoning tasks in the field of charts or\ngeometric images. We evaluate the chart-related ability of mainstream MLLMs and\nour ChartVLM on the proposed ChartX evaluation set. Extensive experiments\ndemonstrate that ChartVLM surpasses both versatile and chart-related large\nmodels, achieving results comparable to GPT-4V. We believe that our study can\npave the way for further exploration in creating a more comprehensive chart\nevaluation set and developing more interpretable multi-modal models. 
Both\nChartX and ChartVLM are available at:\nhttps://github.com/Alpha-Innovator/ChartVLM\n","authors":["Renqiu Xia","Bo Zhang","Hancheng Ye","Xiangchao Yan","Qi Liu","Hongbin Zhou","Zijun Chen","Peng Ye","Min Dou","Botian Shi","Junchi Yan","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2402.12185v5.pdf","comment":"Code and dataset are available for downloading at:\n https://github.com/Alpha-Innovator/ChartVLM 26 pages, 15 figures"},{"id":"http://arxiv.org/abs/2503.03492v1","updated":"2025-03-05T13:32:49Z","published":"2025-03-05T13:32:49Z","title":"Find First, Track Next: Decoupling Identification and Propagation in\n Referring Video Object Segmentation","summary":" Referring video object segmentation aims to segment and track a target object\nin a video using a natural language prompt. Existing methods typically fuse\nvisual and textual features in a highly entangled manner, processing\nmulti-modal information together to generate per-frame masks. However, this\napproach often struggles with ambiguous target identification, particularly in\nscenes with multiple similar objects, and fails to ensure consistent mask\npropagation across frames. To address these limitations, we introduce\nFindTrack, a novel decoupled framework that separates target identification\nfrom mask propagation. FindTrack first adaptively selects a key frame by\nbalancing segmentation confidence and vision-text alignment, establishing a\nrobust reference for the target object. This reference is then utilized by a\ndedicated propagation module to track and segment the object across the entire\nvideo. By decoupling these processes, FindTrack effectively reduces ambiguities\nin target association and enhances segmentation consistency. 
We demonstrate\nthat FindTrack outperforms existing methods on public benchmarks.\n","authors":["Suhwan Cho","Seunghoon Lee","Minhyeok Lee","Jungho Lee","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2503.03492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11098v4","updated":"2025-03-05T13:28:29Z","published":"2024-04-17T06:32:42Z","title":"LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing\n Diffusion Models","summary":" In the era of AIGC, the demand for low-budget or even on-device applications\nof diffusion models emerged. In terms of compressing the Stable Diffusion\nmodels (SDMs), several approaches have been proposed, and most of them\nleveraged the handcrafted layer removal methods to obtain smaller U-Nets, along\nwith knowledge distillation to recover the network performance. However, such a\nhandcrafting manner of layer removal is inefficient and lacks scalability and\ngeneralization, and the feature distillation employed in the retraining phase\nfaces an imbalance issue that a few numerically significant feature loss terms\ndominate over others throughout the retraining process. To this end, we\nproposed the layer pruning and normalized distillation for compressing\ndiffusion models (LAPTOP-Diff). We, 1) introduced the layer pruning method to\ncompress SDM's U-Net automatically and proposed an effective one-shot pruning\ncriterion whose one-shot performance is guaranteed by its good additivity\nproperty, surpassing other layer pruning and handcrafted layer removal methods,\n2) proposed the normalized feature distillation for retraining, alleviated the\nimbalance issue. 
Using the proposed LAPTOP-Diff, we compressed the U-Nets of\nSDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0%\ndecline in PickScore at a pruning ratio of 50% while the comparative methods'\nminimal PickScore decline is 8.2%.\n","authors":["Dingkun Zhang","Sijia Li","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.11098v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18355v2","updated":"2025-03-05T13:25:09Z","published":"2024-12-24T11:35:40Z","title":"Handling Spatial-Temporal Data Heterogeneity for Federated Continual\n Learning via Tail Anchor","summary":" Federated continual learning (FCL) allows each client to continually update\nits knowledge from task streams, enhancing the applicability of federated\nlearning in real-world scenarios. However, FCL needs to address not only\nspatial data heterogeneity between clients but also temporal data heterogeneity\nbetween tasks. In this paper, empirical experiments demonstrate that such\ninput-level heterogeneity significantly affects the model's internal parameters\nand outputs, leading to severe spatial-temporal catastrophic forgetting of\nlocal and previous knowledge. To this end, we propose Federated Tail Anchor\n(FedTA) to mix trainable Tail Anchor with the frozen output features to adjust\ntheir position in the feature space, thereby overcoming parameter-forgetting\nand output-forgetting. Three novel components are also included: Input\nEnhancement for improving the performance of pre-trained models on downstream\ntasks; Selective Input Knowledge Fusion for fusion of heterogeneous local\nknowledge on the server; and Best Global Prototype Selection for finding the\nbest anchor point for each class in the feature space. 
Extensive experiments\ndemonstrate that FedTA not only outperforms existing FCL methods but also\neffectively preserves the relative positions of features.\n","authors":["Hao Yu","Xin Yang","Le Zhang","Hanlin Gu","Tianrui Li","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2412.18355v2.pdf","comment":"This paper is accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03479v1","updated":"2025-03-05T13:16:26Z","published":"2025-03-05T13:16:26Z","title":"Feature Point Extraction for Extra-Affine Image","summary":" The issue concerning the significant decline in the stability of feature\nextraction for images subjected to large-angle affine transformations, where\nthe angle exceeds 50 degrees, still awaits a satisfactory solution. Even ASIFT,\nwhich is built upon SIFT and entails a considerable number of image comparisons\nsimulated by affine transformations, inevitably exhibits the drawbacks of being\ntime-consuming and imposing high demands on memory usage. And the stability of\nfeature extraction drops rapidly under large-view affine transformations.\nConsequently, we propose a method that represents an improvement over ASIFT. On\nthe premise of improving the precision and maintaining the affine invariance,\nit currently ranks as the fastest feature extraction method for extra-affine\nimages that we know of at present. Simultaneously, the stability of feature\nextraction regarding affine transformation images has been approximated to the\nmaximum limits. Both the angle between the shooting direction and the normal\ndirection of the photographed object (absolute tilt angle), and the shooting\ntransformation angle between two images (transition tilt angle) are close to 90\ndegrees. The central idea of the method lies in obtaining the optimal parameter\nset by simulating affine transformation with the reference image. And the\nsimulated affine transformation is reproduced by combining it with the Lanczos\ninterpolation based on the optimal parameter set. 
Subsequently, it is combined\nwith ORB, which exhibits excellent real-time performance for rapid orientation\nbinary description. Moreover, a scale parameter simulation is introduced to\nfurther augment the operational efficiency.\n","authors":["Tao Wang","Yinghui Wang","Yanxing Liang","Liangyi Huang","Jinlong Yang","Wei Li","Xiaojuan Ning"],"pdf_url":"https://arxiv.org/pdf/2503.03479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03475v1","updated":"2025-03-05T13:10:11Z","published":"2025-03-05T13:10:11Z","title":"Bridging Synthetic-to-Real Gaps: Frequency-Aware Perturbation and\n Selection for Single-shot Multi-Parametric Mapping Reconstruction","summary":" Data-centric artificial intelligence (AI) has remarkably advanced medical\nimaging, with emerging methods using synthetic data to address data scarcity\nwhile introducing synthetic-to-real gaps. Unsupervised domain adaptation (UDA)\nshows promise in ground truth-scarce tasks, but its application in\nreconstruction remains underexplored. Although multiple overlapping-echo\ndetachment (MOLED) achieves ultra-fast multi-parametric reconstruction,\nextending its application to various clinical scenarios, the quality suffers\nfrom deficiency in mitigating the domain gap, difficulty in maintaining\nstructural integrity, and inadequacy in ensuring mapping accuracy. To resolve\nthese issues, we proposed frequency-aware perturbation and selection (FPS),\ncomprising Wasserstein distance-modulated frequency-aware perturbation (WDFP)\nand hierarchical frequency-aware selection network (HFSNet), which integrates\nfrequency-aware adaptive selection (FAS), compact FAS (cFAS) and feature-aware\narchitecture integration (FAI). Specifically, perturbation activates\ndomain-invariant feature learning within uncertainty, while selection refines\noptimal solutions within perturbation, establishing a robust and closed-loop\nlearning pathway. 
Extensive experiments on synthetic data, along with diverse\nreal clinical cases from 5 healthy volunteers, 94 ischemic stroke patients, and\n46 meningioma patients, demonstrate the superiority and clinical applicability\nof FPS. Furthermore, FPS is applied to diffusion tensor imaging (DTI),\nunderscoring its versatility and potential for broader medical applications.\nThe code is available at https://github.com/flyannie/FPS.\n","authors":["Linyu Fan","Che Wang","Ming Ye","Qizhi Yang","Zejun Wu","Xinghao Ding","Yue Huang","Jianfeng Bao","Shuhui Cai","Congbo Cai"],"pdf_url":"https://arxiv.org/pdf/2503.03475v1.pdf","comment":"This work will be submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2503.03465v1","updated":"2025-03-05T12:56:33Z","published":"2025-03-05T12:56:33Z","title":"DTU-Net: A Multi-Scale Dilated Transformer Network for Nonlinear\n Hyperspectral Unmixing","summary":" Transformers have shown significant success in hyperspectral unmixing (HU).\nHowever, challenges remain. While multi-scale and long-range spatial\ncorrelations are essential in unmixing tasks, current Transformer-based\nunmixing networks, built on Vision Transformer (ViT) or Swin-Transformer,\nstruggle to capture them effectively. Additionally, current Transformer-based\nunmixing networks rely on the linear mixing model, which lacks the flexibility\nto accommodate scenarios where nonlinear effects are significant. To address\nthese limitations, we propose a multi-scale Dilated Transformer-based unmixing\nnetwork for nonlinear HU (DTU-Net). The encoder employs two branches. The first\none performs multi-scale spatial feature extraction using Multi-Scale Dilated\nAttention (MSDA) in the Dilated Transformer, which varies dilation rates across\nattention heads to capture long-range and multi-scale spatial correlations. The\nsecond one performs spectral feature extraction utilizing 3D-CNNs with channel\nattention. 
The outputs from both branches are then fused to integrate\nmulti-scale spatial and spectral information, which is subsequently transformed\nto estimate the abundances. The decoder is designed to accommodate both linear\nand nonlinear mixing scenarios. Its interpretability is enhanced by explicitly\nmodeling the relationships between endmembers, abundances, and nonlinear\ncoefficients in accordance with the polynomial post-nonlinear mixing model\n(PPNMM). Experiments on synthetic and real datasets validate the effectiveness\nof the proposed DTU-Net compared to PPNMM-derived methods and several advanced\nunmixing networks.\n","authors":["ChenTong Wang","Jincheng Gao","Fei Zhu","Abderrahim Halimi","C'edric Richard"],"pdf_url":"https://arxiv.org/pdf/2503.03465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10860v2","updated":"2025-03-05T12:41:05Z","published":"2024-03-16T08:57:00Z","title":"Sim2Real within 5 Minutes: Efficient Domain Transfer with Stylized\n Gaussian Splatting for Endoscopic Images","summary":" Robot assisted endoluminal intervention is an emerging technique for both\nbenign and malignant luminal lesions. With vision-based navigation, when\ncombined with pre-operative imaging data as priors, it is possible to recover\nposition and pose of the endoscope without the need of additional sensors. In\npractice, however, aligning pre-operative and intra-operative domains is\ncomplicated by significant texture differences. Although methods such as style\ntransfer can be used to address this issue, they require large datasets from\nboth source and target domains with prolonged training times. This paper\nproposes an efficient domain transfer method based on stylized Gaussian\nsplatting, only requiring a few of real images (10 images) with very fast\ntraining time. Specifically, the transfer process includes two phases. In the\nfirst phase, the 3D models reconstructed from CT scans are represented as\ndifferential Gaussian point clouds. 
In the second phase, only color appearance\nrelated parameters are optimized to transfer the style and preserve the visual\ncontent. A novel structure consistency loss is applied to latent features and\ndepth levels to enhance the stability of the transferred images. Detailed\nvalidation was performed to demonstrate the performance advantages of the\nproposed method compared to that of the current state-of-the-art, highlighting\nthe potential for intra-operative surgical navigation.\n","authors":["Junyang Wu","Yun Gu","Guang-Zhong Yang"],"pdf_url":"https://arxiv.org/pdf/2403.10860v2.pdf","comment":"Accepted by ICRA 2025"},{"id":"http://arxiv.org/abs/2503.03453v1","updated":"2025-03-05T12:35:54Z","published":"2025-03-05T12:35:54Z","title":"Active Learning for Deep Learning-Based Hemodynamic Parameter Estimation","summary":" Hemodynamic parameters such as pressure and wall shear stress play an\nimportant role in diagnosis, prognosis, and treatment planning in\ncardiovascular diseases. These parameters can be accurately computed using\ncomputational fluid dynamics (CFD), but CFD is computationally intensive.\nHence, deep learning methods have been adopted as a surrogate to rapidly\nestimate CFD outcomes. A drawback of such data-driven models is the need for\ntime-consuming reference CFD simulations for training. In this work, we\nintroduce an active learning framework to reduce the number of CFD simulations\nrequired for the training of surrogate models, lowering the barriers to their\ndeployment in new applications. We propose three distinct querying strategies\nto determine for which unlabeled samples CFD simulations should be obtained.\nThese querying strategies are based on geometrical variance, ensemble\nuncertainty, and adherence to the physics governing fluid dynamics. We\nbenchmark these methods on velocity field estimation in synthetic coronary\nartery bifurcations and find that they allow for substantial reductions in\nannotation cost. 
Notably, we find that our strategies reduce the number of\nsamples required by up to 50% and make the trained models more robust to\ndifficult cases. Our results show that active learning is a feasible strategy\nto increase the potential of deep learning-based CFD surrogates.\n","authors":["Patryk Rygiel","Julian Suk","Kak Khee Yeung","Christoph Brune","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2503.03453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.05503v3","updated":"2025-03-05T12:27:57Z","published":"2025-02-08T09:31:26Z","title":"A Physical Coherence Benchmark for Evaluating Video Generation Models\n via Optical Flow-guided Frame Prediction","summary":" Recent advances in video generation models demonstrate their potential as\nworld simulators, but they often struggle with videos deviating from physical\nlaws, a key concern overlooked by most text-to-video benchmarks. We introduce a\nbenchmark designed specifically to assess the Physical Coherence of generated\nvideos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of\nphysical principles, capturing key physical laws observable in video content.\nWe evaluated four state-of-the-art (SoTA) T2V models on PhyCoBench and\nconducted manual assessments. Additionally, we propose an automated evaluation\nmodel: PhyCoPredictor, a diffusion model that generates optical flow and video\nframes in a cascade manner. Through a consistency evaluation comparing\nautomated and manual sorting, the experimental results show that PhyCoPredictor\ncurrently aligns most closely with human evaluation. Therefore, it can\neffectively evaluate the physical coherence of videos, providing insights for\nfuture model optimization. 
Our benchmark, including physical coherence prompts,\nthe automatic evaluation tool PhyCoPredictor, and the generated video dataset,\nhas been released on GitHub at https://github.com/Jeckinchen/PhyCoBench.\n","authors":["Yongfan Chen","Xiuwen Zhu","Tianyu Li"],"pdf_url":"https://arxiv.org/pdf/2502.05503v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03446v1","updated":"2025-03-05T12:25:22Z","published":"2025-03-05T12:25:22Z","title":"Biased Heritage: How Datasets Shape Models in Facial Expression\n Recognition","summary":" In recent years, the rapid development of artificial intelligence (AI)\nsystems has raised concerns about our ability to ensure their fairness, that\nis, how to avoid discrimination based on protected characteristics such as\ngender, race, or age. While algorithmic fairness is well-studied in simple\nbinary classification tasks on tabular data, its application to complex,\nreal-world scenarios-such as Facial Expression Recognition (FER)-remains\nunderexplored. FER presents unique challenges: it is inherently multiclass, and\nbiases emerge across intersecting demographic variables, each potentially\ncomprising multiple protected groups. We present a comprehensive framework to\nanalyze bias propagation from datasets to trained models in image-based FER\nsystems, while introducing new bias metrics specifically designed for\nmulticlass problems with multiple demographic groups. Our methodology studies\nbias propagation by (1) inducing controlled biases in FER datasets, (2)\ntraining models on these biased datasets, and (3) analyzing the correlation\nbetween dataset bias metrics and model fairness notions. Our findings reveal\nthat stereotypical biases propagate more strongly to model predictions than\nrepresentational biases, suggesting that preventing emotion-specific\ndemographic patterns should be prioritized over general demographic balance in\nFER datasets. 
Additionally, we observe that biased datasets lead to reduced\nmodel accuracy, challenging the assumed fairness-accuracy trade-off.\n","authors":["Iris Dominguez-Catena","Daniel Paternain","Mikel Galar","MaryBeth Defrance","Maarten Buyl","Tijl De Bie"],"pdf_url":"https://arxiv.org/pdf/2503.03446v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2503.03437v1","updated":"2025-03-05T12:12:51Z","published":"2025-03-05T12:12:51Z","title":"JamMa: Ultra-lightweight Local Feature Matching with Joint Mamba","summary":" Existing state-of-the-art feature matchers capture long-range dependencies\nwith Transformers but are hindered by high spatial complexity, leading to\ndemanding training and highlatency inference. Striking a better balance between\nperformance and efficiency remains a challenge in feature matching. Inspired by\nthe linear complexity O(N) of Mamba, we propose an ultra-lightweight\nMamba-based matcher, named JamMa, which converges on a single GPU and achieves\nan impressive performance-efficiency balance in inference. To unlock the\npotential of Mamba for feature matching, we propose Joint Mamba with a\nscan-merge strategy named JEGO, which enables: (1) Joint scan of two images to\nachieve high-frequency mutual interaction, (2) Efficient scan with skip steps\nto reduce sequence length, (3) Global receptive field, and (4) Omnidirectional\nfeature representation. With the above properties, the JEGO strategy\nsignificantly outperforms the scan-merge strategies proposed in VMamba and\nEVMamba in the feature matching task. 
Compared to attention-based sparse and\nsemi-dense matchers, JamMa demonstrates a superior balance between performance\nand efficiency, delivering better performance with less than 50% of the\nparameters and FLOPs.\n","authors":["Xiaoyong Lu","Songlin Du"],"pdf_url":"https://arxiv.org/pdf/2503.03437v1.pdf","comment":"CVPR 2025, Project page: https://leoluxxx.github.io/JamMa-page/"},{"id":"http://arxiv.org/abs/2503.03430v1","updated":"2025-03-05T12:02:04Z","published":"2025-03-05T12:02:04Z","title":"CoSDH: Communication-Efficient Collaborative Perception via\n Supply-Demand Awareness and Intermediate-Late Hybridization","summary":" Multi-agent collaborative perception enhances perceptual capabilities by\nutilizing information from multiple agents and is considered a fundamental\nsolution to the problem of weak single-vehicle perception in autonomous\ndriving. However, existing collaborative perception methods face a dilemma\nbetween communication efficiency and perception accuracy. To address this\nissue, we propose a novel communication-efficient collaborative perception\nframework based on supply-demand awareness and intermediate-late hybridization,\ndubbed as \\mymethodname. By modeling the supply-demand relationship between\nagents, the framework refines the selection of collaboration regions, reducing\nunnecessary communication cost while maintaining accuracy. In addition, we\ninnovatively introduce the intermediate-late hybrid collaboration mode, where\nlate-stage collaboration compensates for the performance degradation in\ncollaborative perception under low communication bandwidth. Extensive\nexperiments on multiple datasets, including both simulated and real-world\nscenarios, demonstrate that \\mymethodname~ achieves state-of-the-art detection\naccuracy and optimal bandwidth trade-offs, delivering superior detection\nprecision under real communication bandwidths, thus proving its effectiveness\nand practical applicability. 
The code will be released at\nhttps://github.com/Xu2729/CoSDH.\n","authors":["Junhao Xu","Yanan Zhang","Zhi Cai","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2503.03430v1.pdf","comment":"Accepted at CVPR 2025"},{"id":"http://arxiv.org/abs/2306.17567v3","updated":"2025-03-05T11:52:00Z","published":"2023-06-30T11:40:35Z","title":"Counting Guidance for High Fidelity Text-to-Image Synthesis","summary":" Recently, there have been significant improvements in the quality and\nperformance of text-to-image generation, largely due to the impressive results\nattained by diffusion models. However, text-to-image diffusion models sometimes\nstruggle to create high-fidelity content for the given input prompt. One\nspecific issue is their difficulty in generating the precise number of objects\nspecified in the text prompt. For example, when provided with the prompt \"five\napples and ten lemons on a table,\" images generated by diffusion models often\ncontain an incorrect number of objects. In this paper, we present a method to\nimprove diffusion models so that they accurately produce the correct object\ncount based on the input prompt. We adopt a counting network that performs\nreference-less class-agnostic counting for any given image. We calculate the\ngradients of the counting network and refine the predicted noise for each step.\nTo address the presence of multiple types of objects in the prompt, we utilize\nnovel attention map guidance to obtain high-quality masks for each object.\nFinally, we guide the denoising process using the calculated gradients for each\nobject. Through extensive experiments and evaluation, we demonstrate that the\nproposed method significantly enhances the fidelity of diffusion models with\nrespect to object count. Code is available at\nhttps://github.com/furiosa-ai/counting-guidance.\n","authors":["Wonjun Kang","Kevin Galim","Hyung Il Koo","Nam Ik Cho"],"pdf_url":"https://arxiv.org/pdf/2306.17567v3.pdf","comment":"Accepted at WACV 2025 (Oral). 
Code is available at\n https://github.com/furiosa-ai/counting-guidance"},{"id":"http://arxiv.org/abs/2503.03422v1","updated":"2025-03-05T11:49:32Z","published":"2025-03-05T11:49:32Z","title":"Automatic Drywall Analysis for Progress Tracking and Quality Control in\n Construction","summary":" Digitalization in the construction industry has become essential, enabling\ncentralized, easy access to all relevant information of a building. Automated\nsystems can facilitate the timely and resource-efficient documentation of\nchanges, which is crucial for key processes such as progress tracking and\nquality control. This paper presents a method for image-based automated drywall\nanalysis enabling construction progress and quality assessment through on-site\ncamera systems. Our proposed solution integrates a deep learning-based instance\nsegmentation model to detect and classify various drywall elements with an\nanalysis module to cluster individual wall segments, estimate camera\nperspective distortions, and apply the corresponding corrections. This system\nextracts valuable information from images, enabling more accurate progress\ntracking and quality assessment on construction sites. Our main contributions\ninclude a fully automated pipeline for drywall analysis, improving instance\nsegmentation accuracy through architecture modifications and targeted data\naugmentation, and a novel algorithm to extract important information from the\nsegmentation results. 
Our modified model, enhanced with data augmentation,\nachieves significantly higher accuracy compared to other architectures,\noffering more detailed and precise information than existing approaches.\nCombined with the proposed drywall analysis steps, it enables the reliable\nautomation of construction progress and quality assessment.\n","authors":["Mariusz Trzeciakiewicz","Aleixo Cambeiro Barreiro","Niklas Gard","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2503.03422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03603v5","updated":"2025-03-05T11:48:15Z","published":"2024-12-03T23:52:37Z","title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","summary":" Recent advancements in video generation have significantly impacted daily\nlife for both individuals and industries. However, the leading video generation\nmodels remain closed-source, resulting in a notable performance gap between\nindustry capabilities and those available to the public. In this report, we\nintroduce HunyuanVideo, an innovative open-source video foundation model that\ndemonstrates performance in video generation comparable to, or even surpassing,\nthat of leading closed-source models. HunyuanVideo encompasses a comprehensive\nframework that integrates several key elements, including data curation,\nadvanced architectural design, progressive model scaling and training, and an\nefficient infrastructure tailored for large-scale model training and inference.\nAs a result, we successfully trained a video generative model with over 13\nbillion parameters, making it the largest among all open-source models. We\nconducted extensive experiments and implemented a series of targeted designs to\nensure high visual quality, motion dynamics, text-video alignment, and advanced\nfilming techniques. 
According to evaluations by professionals, HunyuanVideo\noutperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6,\nand three top-performing Chinese video generative models. By releasing the code\nfor the foundation model and its applications, we aim to bridge the gap between\nclosed-source and open-source communities. This initiative will empower\nindividuals within the community to experiment with their ideas, fostering a\nmore dynamic and vibrant video generation ecosystem. The code is publicly\navailable at https://github.com/Tencent/HunyuanVideo.\n","authors":["Weijie Kong","Qi Tian","Zijian Zhang","Rox Min","Zuozhuo Dai","Jin Zhou","Jiangfeng Xiong","Xin Li","Bo Wu","Jianwei Zhang","Kathrina Wu","Qin Lin","Junkun Yuan","Yanxin Long","Aladdin Wang","Andong Wang","Changlin Li","Duojun Huang","Fang Yang","Hao Tan","Hongmei Wang","Jacob Song","Jiawang Bai","Jianbing Wu","Jinbao Xue","Joey Wang","Kai Wang","Mengyang Liu","Pengyu Li","Shuai Li","Weiyan Wang","Wenqing Yu","Xinchi Deng","Yang Li","Yi Chen","Yutao Cui","Yuanbo Peng","Zhentao Yu","Zhiyu He","Zhiyong Xu","Zixiang Zhou","Zunnan Xu","Yangyu Tao","Qinglin Lu","Songtao Liu","Dax Zhou","Hongfa Wang","Yong Yang","Di Wang","Yuhong Liu","Jie Jiang","Caesar Zhong"],"pdf_url":"https://arxiv.org/pdf/2412.03603v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01505v4","updated":"2025-03-05T11:39:35Z","published":"2024-03-03T13:08:32Z","title":"SCott: Accelerating Diffusion Models with Stochastic Consistency\n Distillation","summary":" The iterative sampling procedure employed by diffusion models (DMs) often\nleads to significant inference latency. To address this, we propose Stochastic\nConsistency Distillation (SCott) to enable accelerated text-to-image\ngeneration, where high-quality and diverse generations can be achieved within\njust 2-4 sampling steps. 
In contrast to vanilla consistency distillation (CD)\nwhich distills the ordinary differential equation solvers-based sampling\nprocess of a pre-trained teacher model into a student, SCott explores the\npossibility and validates the efficacy of integrating stochastic differential\nequation (SDE) solvers into CD to fully unleash the potential of the teacher.\nSCott is augmented with elaborate strategies to control the noise strength and\nsampling process of the SDE solver. An adversarial loss is further incorporated\nto strengthen the consistency constraints in rare sampling steps. Empirically,\non the MSCOCO-2017 5K dataset with a Stable Diffusion-V1.5 teacher, SCott\nachieves an FID of 21.9 with 2 sampling steps, surpassing that of the 1-step\nInstaFlow (23.4) and the 4-step UFOGen (22.1). Moreover, SCott can yield more\ndiverse samples than other consistency models for high-resolution image\ngeneration, with up to 16% improvement in a qualified metric.\n","authors":["Hongjian Liu","Qingsong Xie","TianXiang Ye","Zhijie Deng","Chen Chen","Shixiang Tang","Xueyang Fu","Haonan Lu","Zheng-jun Zha"],"pdf_url":"https://arxiv.org/pdf/2403.01505v4.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2503.03410v1","updated":"2025-03-05T11:39:15Z","published":"2025-03-05T11:39:15Z","title":"Augmentation-Based Deep Learning for Identification of Circulating Tumor\n Cells","summary":" Circulating tumor cells (CTCs) are crucial biomarkers in liquid biopsy,\noffering a noninvasive tool for cancer patient management. However, their\nidentification remains particularly challenging due to their limited number and\nheterogeneity. Labeling samples for contrast limits the generalization of\nfluorescence-based methods across different hospital datasets. 
Analyzing\nsingle-cell images enables detailed assessment of cell morphology, subcellular\nstructures, and phenotypic variations, often hidden in clustered images.\nDeveloping a method based on bright-field single-cell analysis could overcome\nthese limitations. CTCs can be isolated using an unbiased workflow combining\nParsortix technology, which selects cells based on size and deformability, with\nDEPArray technology, enabling precise visualization and selection of single\ncells. Traditionally, DEPArray-acquired digital images are manually analyzed,\nmaking the process time-consuming and prone to variability. In this study, we\npresent a Deep Learning-based classification pipeline designed to distinguish\nCTCs from leukocytes in blood samples, aimed to enhance diagnostic accuracy and\noptimize clinical workflows. Our approach employs images from the bright-field\nchannel acquired through DEPArray technology leveraging a ResNet-based CNN. To\nimprove model generalization, we applied three types of data augmentation\ntechniques and incorporated fluorescence (DAPI) channel images into the\ntraining phase, allowing the network to learn additional CTC-specific features.\nNotably, only bright-field images have been used for testing, ensuring the\nmodel's ability to identify CTCs without relying on fluorescence markers. The\nproposed model achieved an F1-score of 0.798, demonstrating its capability to\ndistinguish CTCs from leukocytes. 
These findings highlight the potential of DL\nin refining CTC analysis and advancing liquid biopsy applications.\n","authors":["Martina Russo","Giulia Bertolini","Vera Cappelletti","Cinzia De Marco","Serena Di Cosimo","Petra Paiè","Nadia Brancati"],"pdf_url":"https://arxiv.org/pdf/2503.03410v1.pdf","comment":"20 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2503.03395v1","updated":"2025-03-05T11:19:17Z","published":"2025-03-05T11:19:17Z","title":"AI-Driven Multi-Stage Computer Vision System for Defect Detection in\n Laser-Engraved Industrial Nameplates","summary":" Automated defect detection in industrial manufacturing is essential for\nmaintaining product quality and minimizing production errors. In air disc brake\nmanufacturing, ensuring the precision of laser-engraved nameplates is crucial\nfor accurate product identification and quality control. Engraving errors, such\nas misprints or missing characters, can compromise both aesthetics and\nfunctionality, leading to material waste and production delays. This paper\npresents a proof of concept for an AI-driven computer vision system that\ninspects and verifies laser-engraved nameplates, detecting defects in logos and\nalphanumeric strings. The system integrates object detection using YOLOv7,\noptical character recognition (OCR) with Tesseract, and anomaly detection\nthrough a residual variational autoencoder (ResVAE) along with other computer\nvision methods to enable comprehensive inspections at multiple stages.\nExperimental results demonstrate the system's effectiveness, achieving 91.33%\naccuracy and 100% recall, ensuring that defective nameplates are consistently\ndetected and addressed. 
This solution highlights the potential of AI-driven\nvisual inspection to enhance quality control, reduce manual inspection efforts,\nand improve overall manufacturing efficiency.\n","authors":["Adhish Anitha Vilasan","Stephan Jäger","Noah Klarmann"],"pdf_url":"https://arxiv.org/pdf/2503.03395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01243v3","updated":"2025-03-05T11:17:18Z","published":"2024-12-02T08:05:26Z","title":"Schedule On the Fly: Diffusion Time Prediction for Faster and Better\n Image Generation","summary":" Diffusion and flow matching models have achieved remarkable success in\ntext-to-image generation. However, these models typically rely on the\npredetermined denoising schedules for all prompts. The multi-step reverse\ndiffusion process can be regarded as a kind of chain-of-thought for generating\nhigh-quality images step by step. Therefore, diffusion models should reason for\neach instance to adaptively determine the optimal noise schedule, achieving\nhigh generation quality with sampling efficiency. In this paper, we introduce\nthe Time Prediction Diffusion Model (TPDM) for this. TPDM employs a\nplug-and-play Time Prediction Module (TPM) that predicts the next noise level\nbased on current latent features at each denoising step. We train the TPM using\nreinforcement learning to maximize a reward that encourages high final image\nquality while penalizing excessive denoising steps. With such an adaptive\nscheduler, TPDM not only generates high-quality images that are aligned closely\nwith human preferences but also adjusts diffusion time and the number of\ndenoising steps on the fly, enhancing both performance and efficiency. 
With\nStable Diffusion 3 Medium architecture, TPDM achieves an aesthetic score of\n5.44 and a human preference score (HPS) of 29.59, while using around 50% fewer\ndenoising steps to achieve better performance.\n","authors":["Zilyu Ye","Zhiyang Chen","Tiancheng Li","Zemin Huang","Weijian Luo","Guo-Jun Qi"],"pdf_url":"https://arxiv.org/pdf/2412.01243v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14153v2","updated":"2025-03-05T11:15:39Z","published":"2024-08-26T09:55:34Z","title":"Explaining Vision-Language Similarities in Dual Encoders with\n Feature-Pair Attributions","summary":" Dual encoder architectures like CLIP models map two types of inputs into a\nshared embedding space and predict similarities between them. Despite their\nsuccess, it is, however, not understood how these models compare their two\ninputs. Common first-order feature-attribution methods can only provide limited\ninsights into dual-encoders since their predictions depend on\nfeature-interactions rather than on individual features. In this paper, we\nfirst derive a second-order method enabling the attribution of predictions by\nany differentiable dual encoder onto feature-interactions between its inputs.\nSecond, we apply our method to CLIP models and show that they learn\nfine-grained correspondences between parts of captions and regions in images.\nThey match objects across input modes also account for mismatches. This\nvisual-linguistic grounding ability, however, varies heavily between object\nclasses and exhibits pronounced out-of-domain effects. 
We can identify\nindividual errors as well as systematic failure categories including object\ncoverage, unusual scenes and correlated contexts.\n","authors":["Lucas Möller","Pascal Tilli","Ngoc Thang Vu","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2408.14153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03370v1","updated":"2025-03-05T10:46:03Z","published":"2025-03-05T10:46:03Z","title":"MIAdapt: Source-free Few-shot Domain Adaptive Object Detection for\n Microscopic Images","summary":" Existing generic unsupervised domain adaptation approaches require access to\nboth a large labeled source dataset and a sufficient unlabeled target dataset\nduring adaptation. However, collecting a large dataset, even if unlabeled, is a\nchallenging and expensive endeavor, especially in medical imaging. In addition,\nconstraints such as privacy issues can result in cases where source data is\nunavailable. Taking in consideration these challenges, we propose MIAdapt, an\nadaptive approach for Microscopic Imagery Adaptation as a solution for\nSource-free Few-shot Domain Adaptive Object detection (SF-FSDA). We also define\ntwo competitive baselines (1) Faster-FreeShot and (2) MT-FreeShot. Extensive\nexperiments on the challenging M5-Malaria and Raabin-WBC datasets validate the\neffectiveness of MIAdapt. Without using any image from the source domain\nMIAdapt surpasses state-of-the-art source-free UDA (SF-UDA) methods by +21.3%\nmAP and few-shot domain adaptation (FSDA) approaches by +4.7% mAP on\nRaabin-WBC. 
Our code and models will be publicly available.\n","authors":["Nimra Dilawar","Sara Nadeem","Javed Iqbal","Waqas Sultani","Mohsen Ali"],"pdf_url":"https://arxiv.org/pdf/2503.03370v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2503.03367v1","updated":"2025-03-05T10:43:01Z","published":"2025-03-05T10:43:01Z","title":"Top-K Maximum Intensity Projection Priors for 3D Liver Vessel\n Segmentation","summary":" Liver-vessel segmentation is an essential task in the pre-operative planning\nof liver resection. State-of-the-art 2D or 3D convolution-based methods\nfocusing on liver vessel segmentation on 2D CT cross-sectional views, which do\nnot take into account the global liver-vessel topology. To maintain this global\nvessel topology, we rely on the underlying physics used in the CT\nreconstruction process, and apply this to liver-vessel segmentation.\nConcretely, we introduce the concept of top-k maximum intensity projections,\nwhich mimics the CT reconstruction by replacing the integral along each\nprojection direction, with keeping the top-k maxima along each projection\ndirection. We use these top-k maximum projections to condition a diffusion\nmodel and generate 3D liver-vessel trees. We evaluate our 3D liver-vessel\nsegmentation on the 3D-ircadb-01 dataset, and achieve the highest Dice\ncoefficient, intersection-over-union (IoU), and Sensitivity scores compared to\nprior work.\n","authors":["Xiaotong Zhang","Alexander Broersen","Gonnie CM van Erp","Silvia L. 
Pintea","Jouke Dijkstra"],"pdf_url":"https://arxiv.org/pdf/2503.03367v1.pdf","comment":"Accepted in 2025 IEEE International Symposium on Biomedical Imaging\n (ISBI 2025)"},{"id":"http://arxiv.org/abs/2503.03365v1","updated":"2025-03-05T10:42:41Z","published":"2025-03-05T10:42:41Z","title":"TopoMortar: A dataset to evaluate image segmentation methods focused on\n topology accuracy","summary":" We present TopoMortar, a brick wall dataset that is the first dataset\nspecifically designed to evaluate topology-focused image segmentation methods,\nsuch as topology loss functions. TopoMortar enables to investigate in two ways\nwhether methods incorporate prior topological knowledge. First, by eliminating\nchallenges seen in real-world data, such as small training set, noisy labels,\nand out-of-distribution test-set images, that, as we show, impact the\neffectiveness of topology losses. Second, by allowing to assess in the same\ndataset topology accuracy across dataset challenges, isolating dataset-related\neffects from the effect of incorporating prior topological knowledge. In these\ntwo experiments, it is deliberately difficult to improve topology accuracy\nwithout actually using topology information, thus, permitting to attribute an\nimprovement in topology accuracy to the incorporation of prior topological\nknowledge. To this end, TopoMortar includes three types of labels (accurate,\nnoisy, pseudo-labels), two fixed training sets (large and small), and\nin-distribution and out-of-distribution test-set images. We compared eight loss\nfunctions on TopoMortar, and we found that clDice achieved the most\ntopologically accurate segmentations, Skeleton Recall loss performed best\nparticularly with noisy labels, and the relative advantageousness of the other\nloss functions depended on the experimental setting. 
Additionally, we show that\nsimple methods, such as data augmentation and self-distillation, can elevate\nCross entropy Dice loss to surpass most topology loss functions, and that those\nsimple methods can enhance topology loss functions as well. clDice and Skeleton\nRecall loss, both skeletonization-based loss functions, were also the fastest\nto train, making this type of loss function a promising research direction.\nTopoMortar and our code can be found at https://github.com/jmlipman/TopoMortar\n","authors":["Juan Miguel Valverde","Motoya Koga","Nijihiko Otsuka","Anders Bjorholm Dahl"],"pdf_url":"https://arxiv.org/pdf/2503.03365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03355v1","updated":"2025-03-05T10:37:51Z","published":"2025-03-05T10:37:51Z","title":"Video Super-Resolution: All You Need is a Video Diffusion Model","summary":" We present a generic video super-resolution algorithm in this paper, based on\nthe Diffusion Posterior Sampling framework with an unconditional video\ngeneration model in latent space. The video generation model, a diffusion\ntransformer, functions as a space-time model. We argue that a powerful model,\nwhich learns the physics of the real world, can easily handle various kinds of\nmotion patterns as prior knowledge, thus eliminating the need for explicit\nestimation of optical flows or motion parameters for pixel alignment.\nFurthermore, a single instance of the proposed video diffusion transformer\nmodel can adapt to different sampling conditions without re-training. 
Due to\nlimited computational resources and training data, our experiments provide\nempirical evidence of the algorithm's strong super-resolution capabilities\nusing synthetic data.\n","authors":["Zhihao Zhan","Wang Pang","Xiang Zhu","Yechao Bai"],"pdf_url":"https://arxiv.org/pdf/2503.03355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07226v2","updated":"2025-03-05T10:24:18Z","published":"2023-12-12T12:41:35Z","title":"Super-Resolution on Rotationally Scanned Photoacoustic Microscopy Images\n Incorporating Scanning Prior","summary":" Photoacoustic Microscopy (PAM) images integrating the advantages of optical\ncontrast and acoustic resolution have been widely used in brain studies.\nHowever, there exists a trade-off between scanning speed and image resolution.\nCompared with traditional raster scanning, rotational scanning provides good\nopportunities for fast PAM imaging by optimizing the scanning mechanism.\nRecently, there is a trend to incorporate deep learning into the scanning\nprocess to further increase the scanning speed.Yet, most such attempts are\nperformed for raster scanning while those for rotational scanning are\nrelatively rare. In this study, we propose a novel and well-performing\nsuper-resolution framework for rotational scanning-based PAM imaging. To\neliminate adjacent rows' displacements due to subject motion or high-frequency\nscanning distortion,we introduce a registration module across odd and even rows\nin the preprocessing and incorporate displacement degradation in the training.\nBesides, gradient-based patch selection is proposed to increase the probability\nof blood vessel patches being selected for training. 
A Transformer-based\nnetwork with a global receptive field is applied for better performance.\nExperimental results on both synthetic and real datasets demonstrate the\neffectiveness and generalizability of our proposed framework for rotationally\nscanned PAM images'super-resolution, both quantitatively and qualitatively.\nCode is available at https://github.com/11710615/PAMSR.git.\n","authors":["Kai Pan","Linyang Li","Li Lin","Pujin Cheng","Junyan Lyu","Lei Xi","Xiaoyin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.07226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07804v3","updated":"2025-03-05T10:09:25Z","published":"2024-12-09T09:04:02Z","title":"XLSTM-HVED: Cross-Modal Brain Tumor Segmentation and MRI Reconstruction\n Method Using Vision XLSTM and Heteromodal Variational Encoder-Decoder","summary":" Neurogliomas are among the most aggressive forms of cancer, presenting\nconsiderable challenges in both treatment and monitoring due to their\nunpredictable biological behavior. Magnetic resonance imaging (MRI) is\ncurrently the preferred method for diagnosing and monitoring gliomas. However,\nthe lack of specific imaging techniques often compromises the accuracy of tumor\nsegmentation during the imaging process. To address this issue, we introduce\nthe XLSTM-HVED model. This model integrates a hetero-modal encoder-decoder\nframework with the Vision XLSTM module to reconstruct missing MRI modalities.\nBy deeply fusing spatial and temporal features, it enhances tumor segmentation\nperformance. The key innovation of our approach is the Self-Attention\nVariational Encoder (SAVE) module, which improves the integration of modal\nfeatures. Additionally, it optimizes the interaction of features between\nsegmentation and reconstruction tasks through the Squeeze-Fusion-Excitation\nCross Awareness (SFECA) module. 
Our experiments using the BraTS 2024 dataset\ndemonstrate that our model significantly outperforms existing advanced methods\nin handling cases where modalities are missing. Our source code is available at\nhttps://github.com/Quanato607/XLSTM-HVED.\n","authors":["Shenghao Zhu","Yifei Chen","Shuo Jiang","Weihong Chen","Chang Liu","Yuanhan Wang","Xu Chen","Yifan Ke","Feiwei Qin","Changmiao Wang","Zhu Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.07804v3.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2503.03330v1","updated":"2025-03-05T10:03:21Z","published":"2025-03-05T10:03:21Z","title":"Automated Attendee Recognition System for Large-Scale Social Events or\n Conference Gathering","summary":" Manual attendance tracking at large-scale events, such as marriage functions\nor conferences, is often inefficient and prone to human error. To address this\nchallenge, we propose an automated, cloud-based attendance tracking system that\nuses cameras mounted at the entrance and exit gates. The mounted cameras\ncontinuously capture video and send the video data to cloud services to perform\nreal-time face detection and recognition. Unlike existing solutions, our system\naccurately identifies attendees even when they are not looking directly at the\ncamera, allowing natural movements, such as looking around or talking while\nwalking. To the best of our knowledge, this is the first system to achieve high\nrecognition rates under such dynamic conditions. Our system demonstrates\noverall 90% accuracy, with each video frame processed in 5 seconds, ensuring\nreal time operation without frame loss. In addition, notifications are sent\npromptly to security personnel within the same latency. 
This system achieves\n100% accuracy for individuals without facial obstructions and successfully\nrecognizes all attendees appearing within the camera's field of view, providing\na robust solution for attendee recognition in large-scale social events.\n","authors":["Dhruv Motwani","Ankush Tyagi","Vipul Dabhi","Harshadkumar Prajapati"],"pdf_url":"https://arxiv.org/pdf/2503.03330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03329v1","updated":"2025-03-05T10:02:35Z","published":"2025-03-05T10:02:35Z","title":"Deep Learning-Based Diffusion MRI Tractography: Integrating Spatial and\n Anatomical Information","summary":" Diffusion MRI tractography technique enables non-invasive visualization of\nthe white matter pathways in the brain. It plays a crucial role in neuroscience\nand clinical fields by facilitating the study of brain connectivity and\nneurological disorders. However, the accuracy of reconstructed tractograms has\nbeen a longstanding challenge. Recently, deep learning methods have been\napplied to improve tractograms for better white matter coverage, but often\ncomes at the expense of generating excessive false-positive connections. This\nis largely due to their reliance on local information to predict long range\nstreamlines. To improve the accuracy of streamline propagation predictions, we\nintroduce a novel deep learning framework that integrates image-domain spatial\ninformation and anatomical information along tracts, with the former extracted\nthrough convolutional layers and the later modeled via a Transformer-decoder.\nAdditionally, we employ a weighted loss function to address fiber class\nimbalance encountered during training. We evaluate the proposed method on the\nsimulated ISMRM 2015 Tractography Challenge dataset, achieving a valid\nstreamline rate of 66.2%, white matter coverage of 63.8%, and successfully\nreconstructing 24 out of 25 bundles. 
Furthermore, on the multi-site\nTractoinferno dataset, the proposed method demonstrates its ability to handle\nvarious diffusion MRI acquisition schemes, achieving a 5.7% increase in white\nmatter coverage and a 4.1% decrease in overreach compared to RNN-based methods.\n","authors":["Yiqiong Yang","Yitian Yuan","Baoxing Ren","Ye Wu","Yanqiu Feng","Xinyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03327v1","updated":"2025-03-05T10:00:32Z","published":"2025-03-05T10:00:32Z","title":"ScaleFusionNet: Transformer-Guided Multi-Scale Feature Fusion for Skin\n Lesion Segmentation","summary":" Melanoma is a malignant tumor originating from skin cell lesions. Accurate\nand efficient segmentation of skin lesions is essential for quantitative\nmedical analysis but remains challenging. To address this, we propose\nScaleFusionNet, a segmentation model that integrates Cross-Attention\nTransformer Module (CATM) and AdaptiveFusionBlock to enhance feature extraction\nand fusion. The model employs a hybrid architecture encoder that effectively\ncaptures both local and global features. We introduce CATM, which utilizes Swin\nTransformer Blocks and Cross Attention Fusion (CAF) to adaptively refine\nencoder-decoder feature fusion, reducing semantic gaps and improving\nsegmentation accuracy. Additionally, the AdaptiveFusionBlock is improved by\nintegrating adaptive multi-scale fusion, where Swin Transformer-based attention\ncomplements deformable convolution-based multi-scale feature extraction. This\nenhancement refines lesion boundaries and preserves fine-grained details.\nScaleFusionNet achieves Dice scores of 92.94% and 91.65% on ISIC-2016 and\nISIC-2018 datasets, respectively, demonstrating its effectiveness in skin\nlesion analysis. Our code implementation is publicly available at GitHub.\n","authors":["Saqib Qamar","Syed Furqan Qadri","Roobaea Alroobaea","Majed Alsafyani","Abdullah M. 
Baqasah"],"pdf_url":"https://arxiv.org/pdf/2503.03327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03325v1","updated":"2025-03-05T09:59:23Z","published":"2025-03-05T09:59:23Z","title":"Golden Cudgel Network for Real-Time Semantic Segmentation","summary":" Recent real-time semantic segmentation models, whether single-branch or\nmulti-branch, achieve good performance and speed. However, their speed is\nlimited by multi-path blocks, and some depend on high-performance teacher\nmodels for training. To overcome these issues, we propose Golden Cudgel Network\n(GCNet). Specifically, GCNet uses vertical multi-convolutions and horizontal\nmulti-paths for training, which are reparameterized into a single convolution\nfor inference, optimizing both performance and speed. This design allows GCNet\nto self-enlarge during training and self-contract during inference, effectively\nbecoming a \"teacher model\" without needing external ones. Experimental results\nshow that GCNet outperforms existing state-of-the-art models in terms of\nperformance and speed on the Cityscapes, CamVid, and Pascal VOC 2012 datasets.\nThe code is available at https://github.com/gyyang23/GCNet.\n","authors":["Guoyu Yang","Yuan Wang","Daming Shi","Yanzhong Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03321v1","updated":"2025-03-05T09:55:07Z","published":"2025-03-05T09:55:07Z","title":"See What You Are Told: Visual Attention Sink in Large Multimodal Models","summary":" Large multimodal models (LMMs) \"see\" images by leveraging the attention\nmechanism between text and visual tokens in the transformer decoder. Ideally,\nthese models should focus on key visual information relevant to the text token.\nHowever, recent findings indicate that LMMs have an extraordinary tendency to\nconsistently allocate high attention weights to specific visual tokens, even\nwhen these tokens are irrelevant to the corresponding text. 
In this study, we\ninvestigate the property behind the appearance of these irrelevant visual\ntokens and examine their characteristics. Our findings show that this behavior\narises due to the massive activation of certain hidden state dimensions, which\nresembles the attention sink found in language models. Hence, we refer to this\nphenomenon as the visual attention sink. In particular, our analysis reveals\nthat removing the irrelevant visual sink tokens does not impact model\nperformance, despite receiving high attention weights. Consequently, we recycle\nthe attention to these tokens as surplus resources, redistributing the\nattention budget to enhance focus on the image. To achieve this, we introduce\nVisual Attention Redistribution (VAR), a method that redistributes attention in\nimage-centric heads, which we identify as innately focusing on visual\ninformation. VAR can be seamlessly applied across different LMMs to improve\nperformance on a wide range of tasks, including general vision-language tasks,\nvisual hallucination tasks, and vision-centric tasks, all without the need for\nadditional training, models, or inference steps. Experimental results\ndemonstrate that VAR enables LMMs to process visual information more\neffectively by adjusting their internal attention mechanisms, offering a new\ndirection to enhancing the multimodal capabilities of LMMs.\n","authors":["Seil Kang","Jinyeong Kim","Junhyeok Kim","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2503.03321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09510v5","updated":"2025-03-05T09:44:52Z","published":"2024-06-17T11:43:38Z","title":"3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods","summary":" 3D Gaussian Splatting (3DGS) has emerged as a cutting-edge technique for\nreal-time radiance field rendering, offering state-of-the-art performance in\nterms of both quality and speed. 
3DGS models a scene as a collection of\nthree-dimensional Gaussians, with additional attributes optimized to conform to\nthe scene's geometric and visual properties. Despite its advantages in\nrendering speed and image fidelity, 3DGS is limited by its significant storage\nand memory demands. These high demands make 3DGS impractical for mobile devices\nor headsets, reducing its applicability in important areas of computer\ngraphics. To address these challenges and advance the practicality of 3DGS,\nthis survey provides a comprehensive and detailed examination of compression\nand compaction techniques developed to make 3DGS more efficient. We classify\nexisting methods into two categories: compression, which focuses on reducing\nfile size, and compaction, which aims to minimize the number of Gaussians. Both\nmethods aim to maintain or improve quality, each by minimizing its respective\nattribute: file size for compression and Gaussian count for compaction. We\nintroduce the basic mathematical concepts underlying the analyzed methods, as\nwell as key implementation details and design choices. Our report thoroughly\ndiscusses similarities and differences among the methods, as well as their\nrespective advantages and disadvantages. We establish a consistent framework\nfor comparing the surveyed methods based on key performance metrics and\ndatasets. Specifically, since these methods have been developed in parallel and\nover a short period of time, currently, no comprehensive comparison exists.\nThis survey, for the first time, presents a unified framework to evaluate 3DGS\ncompression techniques. We maintain a website that will be regularly updated\nwith emerging methods: https://w-m.github.io/3dgs-compression-survey/ .\n","authors":["Milena T. 
Bagdasarian","Paul Knoll","Yi-Hsin Li","Florian Barthel","Anna Hilsmann","Peter Eisert","Wieland Morgenstern"],"pdf_url":"https://arxiv.org/pdf/2407.09510v5.pdf","comment":"3D Gaussian Splatting compression survey; 3DGS compression; updated\n discussion; new approaches added; new illustrations"},{"id":"http://arxiv.org/abs/2503.03307v1","updated":"2025-03-05T09:39:51Z","published":"2025-03-05T09:39:51Z","title":"Full-DoF Egomotion Estimation for Event Cameras Using Geometric Solvers","summary":" For event cameras, current sparse geometric solvers for egomotion estimation\nassume that the rotational displacements are known, such as those provided by\nan IMU. Thus, they can only recover the translational motion parameters.\nRecovering full-DoF motion parameters using a sparse geometric solver is a more\nchallenging task, and has not yet been investigated. In this paper, we propose\nseveral solvers to estimate both rotational and translational velocities within\na unified framework. Our method leverages event manifolds induced by line\nsegments. The problem formulations are based on either an incidence relation\nfor lines or a novel coplanarity relation for normal vectors. We demonstrate\nthe possibility of recovering full-DoF egomotion parameters for both angular\nand linear velocities without requiring extra sensor measurements or motion\npriors. To achieve efficient optimization, we exploit the Adam framework with a\nfirst-order approximation of rotations for quick initialization. 
Experiments on\nboth synthetic and real-world data demonstrate the effectiveness of our method.\nThe code is available at https://github.com/jizhaox/relpose-event.\n","authors":["Ji Zhao","Banglei Guan","Zibin Liu","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2503.03307v1.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR), 2025"},{"id":"http://arxiv.org/abs/2503.03299v1","updated":"2025-03-05T09:30:49Z","published":"2025-03-05T09:30:49Z","title":"Label-Efficient LiDAR Semantic Segmentation with 2D-3D Vision\n Transformer Adapters","summary":" LiDAR semantic segmentation models are typically trained from random\ninitialization as universal pre-training is hindered by the lack of large,\ndiverse datasets. Moreover, most point cloud segmentation architectures\nincorporate custom network layers, limiting the transferability of advances\nfrom vision-based architectures. Inspired by recent advances in universal\nfoundation models, we propose BALViT, a novel approach that leverages frozen\nvision models as amodal feature encoders for learning strong LiDAR encoders.\nSpecifically, BALViT incorporates both range-view and bird's-eye-view LiDAR\nencoding mechanisms, which we combine through a novel 2D-3D adapter. While the\nrange-view features are processed through a frozen image backbone, our\nbird's-eye-view branch enhances them through multiple cross-attention\ninteractions. Thereby, we continuously improve the vision network with\ndomain-dependent knowledge, resulting in a strong label-efficient LiDAR\nencoding mechanism. Extensive evaluations of BALViT on the SemanticKITTI and\nnuScenes benchmarks demonstrate that it outperforms state-of-the-art methods on\nsmall data regimes. 
We make the code and models publicly available at:\nhttp://balvit.cs.uni-freiburg.de.\n","authors":["Julia Hindel","Rohit Mohan","Jelena Bratulic","Daniele Cattaneo","Thomas Brox","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2503.03299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03294v1","updated":"2025-03-05T09:18:27Z","published":"2025-03-05T09:18:27Z","title":"Interactive Segmentation and Report Generation for CT Images","summary":" Automated CT report generation plays a crucial role in improving diagnostic\naccuracy and clinical workflow efficiency. However, existing methods lack\ninterpretability and impede patient-clinician understanding, while their static\nnature restricts radiologists from dynamically adjusting assessments during\nimage review. Inspired by interactive segmentation techniques, we propose a\nnovel interactive framework for 3D lesion morphology reporting that seamlessly\ngenerates segmentation masks with comprehensive attribute descriptions,\nenabling clinicians to generate detailed lesion profiles for enhanced\ndiagnostic assessment. To our best knowledge, we are the first to integrate the\ninteractive segmentation and structured reports in 3D CT medical images.\nExperimental results across 15 lesion types demonstrate the effectiveness of\nour approach in providing a more comprehensive and reliable reporting system\nfor lesion segmentation and capturing. The source code will be made publicly\navailable following paper acceptance.\n","authors":["Yannian Gu","Wenhui Lei","Hanyu Chen","Xiaofan Zhang","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03287v1","updated":"2025-03-05T09:13:40Z","published":"2025-03-05T09:13:40Z","title":"Deep Understanding of Sign Language for Sign to Subtitle Alignment","summary":" The objective of this work is to align asynchronous subtitles in sign\nlanguage videos with limited labelled data. 
To achieve this goal, we propose a\nnovel framework with the following contributions: (1) we leverage fundamental\ngrammatical rules of British Sign Language (BSL) to pre-process the input\nsubtitles, (2) we design a selective alignment loss to optimise the model for\npredicting the temporal location of signs only when the queried sign actually\noccurs in a scene, and (3) we conduct self-training with refined pseudo-labels\nwhich are more accurate than the heuristic audio-aligned labels. From this, our\nmodel not only better understands the correlation between the text and the\nsigns, but also holds potential for application in the translation of sign\nlanguages, particularly in scenarios where manual labelling of large-scale sign\ndata is impractical or challenging. Extensive experimental results demonstrate\nthat our approach achieves state-of-the-art results, surpassing previous\nbaselines by substantial margins in terms of both frame-level accuracy and\nF1-score. This highlights the effectiveness and practicality of our framework\nin advancing the field of sign language video alignment and translation.\n","authors":["Youngjoon Jang","Jeongsoo Choi","Junseok Ahn","Joon Son Chung"],"pdf_url":"https://arxiv.org/pdf/2503.03287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03286v1","updated":"2025-03-05T09:13:19Z","published":"2025-03-05T09:13:19Z","title":"Enhancing Visual Forced Alignment with Local Context-Aware Feature\n Extraction and Multi-Task Learning","summary":" This paper introduces a novel approach to Visual Forced Alignment (VFA),\naiming to accurately synchronize utterances with corresponding lip movements,\nwithout relying on audio cues. We propose a novel VFA approach that integrates\na local context-aware feature extractor and employs multi-task learning to\nrefine both global and local context features, enhancing sensitivity to subtle\nlip movements for precise word-level and phoneme-level alignment. 
Incorporating\nthe improved Viterbi algorithm for post-processing, our method significantly\nreduces misalignments. Experimental results show our approach outperforms\nexisting methods, achieving a 6% accuracy improvement at the word-level and 27%\nimprovement at the phoneme-level in LRS2 dataset. These improvements offer new\npotential for applications in automatically subtitling TV shows or\nuser-generated content platforms like TikTok and YouTube Shorts.\n","authors":["Yi He","Lei Yang","Shilin Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03286v1.pdf","comment":"Accepted by ICASSP2025"},{"id":"http://arxiv.org/abs/2503.03285v1","updated":"2025-03-05T09:12:16Z","published":"2025-03-05T09:12:16Z","title":"Enhancing Vietnamese VQA through Curriculum Learning on Raw and\n Augmented Text Representations","summary":" Visual Question Answering (VQA) is a multimodal task requiring reasoning\nacross textual and visual inputs, which becomes particularly challenging in\nlow-resource languages like Vietnamese due to linguistic variability and the\nlack of high-quality datasets. Traditional methods often rely heavily on\nextensive annotated datasets, computationally expensive pipelines, and large\npre-trained models, specifically in the domain of Vietnamese VQA, limiting\ntheir applicability in such scenarios. To address these limitations, we propose\na training framework that combines a paraphrase-based feature augmentation\nmodule with a dynamic curriculum learning strategy. Explicitly, augmented\nsamples are considered \"easy\" while raw samples are regarded as \"hard\". The\nframework then utilizes a mechanism that dynamically adjusts the ratio of easy\nto hard samples during training, progressively modifying the same dataset to\nincrease its difficulty level. By enabling gradual adaptation to task\ncomplexity, this approach helps the Vietnamese VQA model generalize well, thus\nimproving overall performance. 
Experimental results show consistent\nimprovements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset,\nhighlighting both the potential and challenges of our approach in advancing VQA\nfor Vietnamese language.\n","authors":["Khoi Anh Nguyen","Linh Yen Vu","Thang Dinh Duong","Thuan Nguyen Duong","Huy Thanh Nguyen","Vinh Quang Dinh"],"pdf_url":"https://arxiv.org/pdf/2503.03285v1.pdf","comment":"10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and\n Intelligence"},{"id":"http://arxiv.org/abs/2503.03284v1","updated":"2025-03-05T09:12:12Z","published":"2025-03-05T09:12:12Z","title":"Gaussian highpass guided image filtering","summary":" Guided image filtering (GIF) is a popular smoothing technique, in which an\nadditional image is used as a structure guidance for noise removal with edge\npreservation. The original GIF and some of its subsequent improvements are\nderived from a two-parameter local affine model (LAM), where the filtering\noutput is a local affine transformation of the guidance image, but the input\nimage is not taken into account in the LAM formulation. In this paper, we first\nintroduce a single-parameter Prior Model based on Gaussian (highpass/lowpass)\nFiltering (PM-GF), in which the filtering output is the sum of a weighted\nportion of Gaussian highpass filtering of the guidance image and Gaussian\nsmoothing of the input image. In the PM-GF, the guidance structure determined\nby Gaussian highpass filtering is obviously transferred to the filtering\noutput, thereby better revealing the structure transfer mechanism of guided\nfiltering. Then we propose several Gaussian highpass GIFs (GH-GIFs) based on\nthe PM-GF by emulating the original GIF and some improvements, i.e., using\nPM-GF instead of LAM in these GIFs. 
Experimental results illustrate that the\nproposed GIFs outperform their counterparts in several image processing\napplications.\n","authors":["Lei Zhao","Chuanjiang He"],"pdf_url":"https://arxiv.org/pdf/2503.03284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03280v1","updated":"2025-03-05T09:03:46Z","published":"2025-03-05T09:03:46Z","title":"BEVMOSNet: Multimodal Fusion for BEV Moving Object Segmentation","summary":" Accurate motion understanding of the dynamic objects within the scene in\nbird's-eye-view (BEV) is critical to ensure a reliable obstacle avoidance\nsystem and smooth path planning for autonomous vehicles. However, this task has\nreceived relatively limited exploration when compared to object detection and\nsegmentation with only a few recent vision-based approaches presenting\npreliminary findings that significantly deteriorate in low-light, nighttime,\nand adverse weather conditions such as rain. Conversely, LiDAR and radar\nsensors remain almost unaffected in these scenarios, and radar provides key\nvelocity information of the objects. Therefore, we introduce BEVMOSNet, to our\nknowledge, the first end-to-end multimodal fusion leveraging cameras, LiDAR,\nand radar to precisely predict the moving objects in BEV. 
In addition, we\nperform a deeper analysis to find out the optimal strategy for deformable\ncross-attention-guided sensor fusion for cross-sensor knowledge sharing in BEV.\nWhile evaluating BEVMOSNet on the nuScenes dataset, we show an overall\nimprovement in IoU score of 36.59% compared to the vision-based unimodal\nbaseline BEV-MoSeg (Sigatapu et al., 2023), and 2.35% compared to the\nmultimodel SimpleBEV (Harley et al., 2022), extended for the motion\nsegmentation task, establishing this method as the state-of-the-art in BEV\nmotion segmentation.\n","authors":["Hiep Truong Cong","Ajay Kumar Sigatapu","Arindam Das","Yashwanth Sharma","Venkatesh Satagopan","Ganesh Sistu","Ciaran Eising"],"pdf_url":"https://arxiv.org/pdf/2503.03280v1.pdf","comment":"In Proceedings of the 20th International Joint Conference on Computer\n Vision, Imaging and Computer Graphics Theory and Applications (2025)"},{"id":"http://arxiv.org/abs/2412.12843v2","updated":"2025-03-05T09:03:18Z","published":"2024-12-17T12:11:04Z","title":"SLTNet: Efficient Event-based Semantic Segmentation with Spike-driven\n Lightweight Transformer-based Networks","summary":" Event-based semantic segmentation has great potential in autonomous driving\nand robotics due to the advantages of event cameras, such as high dynamic\nrange, low latency, and low power cost. Unfortunately, current artificial\nneural network (ANN)-based segmentation methods suffer from high computational\ndemands, the requirements for image frames, and massive energy consumption,\nlimiting their efficiency and application on resource-constrained edge/mobile\nplatforms. To address these problems, we introduce SLTNet, a spike-driven\nlightweight transformer-based network designed for event-based semantic\nsegmentation. Specifically, SLTNet is built on efficient spike-driven\nconvolution blocks (SCBs) to extract rich semantic features while reducing the\nmodel's parameters. 
Then, to enhance the long-range contextural feature\ninteraction, we propose novel spike-driven transformer blocks (STBs) with\nbinary mask operations. Based on these basic blocks, SLTNet employs a\nhigh-efficiency single-branch architecture while maintaining the low energy\nconsumption of the Spiking Neural Network (SNN). Finally, extensive experiments\non DDD17 and DSEC-Semantic datasets demonstrate that SLTNet outperforms\nstate-of-the-art (SOTA) SNN-based methods by at most 9.06% and 9.39% mIoU,\nrespectively, with extremely 4.58x lower energy consumption and 114 FPS\ninference speed. Our code is open-sourced and available at\nhttps://github.com/longxianlei/SLTNet-v1.0.\n","authors":["Xiaxin Zhu","Fangming Guo","Xianlei Long","Qingyi Gu","Chao Chen","Fuqiang Gu"],"pdf_url":"https://arxiv.org/pdf/2412.12843v2.pdf","comment":"Submitted to 2025 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2025)"},{"id":"http://arxiv.org/abs/2503.03278v1","updated":"2025-03-05T09:02:33Z","published":"2025-03-05T09:02:33Z","title":"Enhancing Abnormality Grounding for Vision Language Models with\n Knowledge Descriptions","summary":" Visual Language Models (VLMs) have demonstrated impressive capabilities in\nvisual grounding tasks. However, their effectiveness in the medical domain,\nparticularly for abnormality detection and localization within medical images,\nremains underexplored. A major challenge is the complex and abstract nature of\nmedical terminology, which makes it difficult to directly associate\npathological anomaly terms with their corresponding visual features. In this\nwork, we introduce a novel approach to enhance VLM performance in medical\nabnormality detection and localization by leveraging decomposed medical\nknowledge. Instead of directly prompting models to recognize specific\nabnormalities, we focus on breaking down medical concepts into fundamental\nattributes and common visual patterns. 
This strategy promotes a stronger\nalignment between textual descriptions and visual features, improving both the\nrecognition and localization of abnormalities in medical images.We evaluate our\nmethod on the 0.23B Florence-2 base model and demonstrate that it achieves\ncomparable performance in abnormality grounding to significantly larger 7B\nLLaVA-based medical VLMs, despite being trained on only 1.5% of the data used\nfor such models. Experimental results also demonstrate the effectiveness of our\napproach in both known and previously unseen abnormalities, suggesting its\nstrong generalization capabilities.\n","authors":["Jun Li","Che Liu","Wenjia Bai","Rossella Arcucci","Cosmin I. Bercea","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2503.03278v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2503.03272v1","updated":"2025-03-05T08:52:55Z","published":"2025-03-05T08:52:55Z","title":"Towards Effective and Sparse Adversarial Attack on Spiking Neural\n Networks via Breaking Invisible Surrogate Gradients","summary":" Spiking neural networks (SNNs) have shown their competence in handling\nspatial-temporal event-based data with low energy consumption. Similar to\nconventional artificial neural networks (ANNs), SNNs are also vulnerable to\ngradient-based adversarial attacks, wherein gradients are calculated by\nspatial-temporal back-propagation (STBP) and surrogate gradients (SGs).\nHowever, the SGs may be invisible for an inference-only model as they do not\ninfluence the inference results, and current gradient-based attacks are\nineffective for binary dynamic images captured by the dynamic vision sensor\n(DVS). While some approaches addressed the issue of invisible SGs through\nuniversal SGs, their SGs lack a correlation with the victim model, resulting in\nsub-optimal performance. Moreover, the imperceptibility of existing SNN-based\nbinary attacks is still insufficient. 
In this paper, we introduce an innovative\npotential-dependent surrogate gradient (PDSG) method to establish a robust\nconnection between the SG and the model, thereby enhancing the adaptability of\nadversarial attacks across various models with invisible SGs. Additionally, we\npropose the sparse dynamic attack (SDA) to effectively attack binary dynamic\nimages. Utilizing a generation-reduction paradigm, SDA can fully optimize the\nsparsity of adversarial perturbations. Experimental results demonstrate that\nour PDSG and SDA outperform state-of-the-art SNN-based attacks across various\nmodels and datasets. Specifically, our PDSG achieves 100% attack success rate\non ImageNet, and our SDA obtains 82% attack success rate by modifying only\n0.24% of the pixels on CIFAR10DVS. The code is available at\nhttps://github.com/ryime/PDSG-SDA .\n","authors":["Li Lun","Kunyu Feng","Qinglong Ni","Ling Liang","Yuan Wang","Ying Li","Dunshan Yu","Xiaoxin Cui"],"pdf_url":"https://arxiv.org/pdf/2503.03272v1.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03270v1","updated":"2025-03-05T08:51:55Z","published":"2025-03-05T08:51:55Z","title":"Reduced Spatial Dependency for More General Video-level Deepfake\n Detection","summary":" As one of the prominent AI-generated content, Deepfake has raised significant\nsafety concerns. Although it has been demonstrated that temporal consistency\ncues offer better generalization capability, existing methods based on CNNs\ninevitably introduce spatial bias, which hinders the extraction of intrinsic\ntemporal features. To address this issue, we propose a novel method called\nSpatial Dependency Reduction (SDR), which integrates common temporal\nconsistency features from multiple spatially-perturbed clusters, to reduce the\ndependency of the model on spatial information. Specifically, we design\nmultiple Spatial Perturbation Branch (SPB) to construct spatially-perturbed\nfeature clusters. 
Subsequently, we utilize the theory of mutual information and\npropose a Task-Relevant Feature Integration (TRFI) module to capture temporal\nfeatures residing in similar latent space from these clusters. Finally, the\nintegrated feature is fed into a temporal transformer to capture long-range\ndependencies. Extensive benchmarks and ablation studies demonstrate the\neffectiveness and rationale of our approach.\n","authors":["Beilin Chu","Xuan Xu","Yufei Zhang","Weike You","Linna Zhou"],"pdf_url":"https://arxiv.org/pdf/2503.03270v1.pdf","comment":"5 pages, 2 figures. Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2503.03265v1","updated":"2025-03-05T08:47:36Z","published":"2025-03-05T08:47:36Z","title":"Optimizing for the Shortest Path in Denoising Diffusion Model","summary":" In this research, we propose a novel denoising diffusion model based on\nshortest-path modeling that optimizes residual propagation to enhance both\ndenoising efficiency and quality.Drawing on Denoising Diffusion Implicit Models\n(DDIM) and insights from graph theory, our model, termed the Shortest Path\nDiffusion Model (ShortDF), treats the denoising process as a shortest-path\nproblem aimed at minimizing reconstruction error. By optimizing the initial\nresiduals, we improve the efficiency of the reverse diffusion process and the\nquality of the generated samples.Extensive experiments on multiple standard\nbenchmarks demonstrate that ShortDF significantly reduces diffusion time (or\nsteps) while enhancing the visual fidelity of generated samples compared to\nprior arts.This work, we suppose, paves the way for interactive diffusion-based\napplications and establishes a foundation for rapid data generation. 
Code is\navailable at https://github.com/UnicomAI/ShortDF.\n","authors":["Ping Chen","Xingpeng Zhang","Zhaoxiang Liu","Huan Hu","Xiang Liu","Kai Wang","Min Wang","Yanlin Qian","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2503.03265v1.pdf","comment":"Accepet by CVPR 2025 (10 pages, 6 figures)"},{"id":"http://arxiv.org/abs/2408.07246v3","updated":"2025-03-05T08:43:44Z","published":"2024-08-14T01:16:40Z","title":"ChemVLM: Exploring the Power of Multimodal Large Language Models in\n Chemistry Area","summary":" Large Language Models (LLMs) have achieved remarkable success and have been\napplied across various scientific fields, including chemistry. However, many\nchemical tasks require the processing of visual information, which cannot be\nsuccessfully handled by existing chemical LLMs. This brings a growing need for\nmodels capable of integrating multimodal information in the chemical domain. In\nthis paper, we introduce \\textbf{ChemVLM}, an open-source chemical multimodal\nlarge language model specifically designed for chemical applications. ChemVLM\nis trained on a carefully curated bilingual multimodal dataset that enhances\nits ability to understand both textual and visual chemical information,\nincluding molecular structures, reactions, and chemistry examination questions.\nWe develop three datasets for comprehensive evaluation, tailored to Chemical\nOptical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and\nMultimodal Molecule Understanding tasks. We benchmark ChemVLM against a range\nof open-source and proprietary multimodal large language models on various\ntasks. Experimental results demonstrate that ChemVLM achieves competitive\nperformance across all evaluated tasks. 
Our model can be found at\nhttps://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Yaotian Yang","Xinrui Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v3.pdf","comment":"11 pages, updated version"},{"id":"http://arxiv.org/abs/2503.03262v1","updated":"2025-03-05T08:38:51Z","published":"2025-03-05T08:38:51Z","title":"Trajectory Prediction for Autonomous Driving: Progress, Limitations, and\n Future Directions","summary":" As the potential for autonomous vehicles to be integrated on a large scale\ninto modern traffic systems continues to grow, ensuring safe navigation in\ndynamic environments is crucial for smooth integration. To guarantee safety and\nprevent collisions, autonomous vehicles must be capable of accurately\npredicting the trajectories of surrounding traffic agents. Over the past\ndecade, significant efforts from both academia and industry have been dedicated\nto designing solutions for precise trajectory forecasting. These efforts have\nproduced a diverse range of approaches, raising questions about the differences\nbetween these methods and whether trajectory prediction challenges have been\nfully addressed. This paper reviews a substantial portion of recent trajectory\nprediction methods and devises a taxonomy to classify existing solutions. A\ngeneral overview of the prediction pipeline is also provided, covering input\nand output modalities, modeling features, and prediction paradigms discussed in\nthe literature. 
In addition, the paper discusses active research areas within\ntrajectory prediction, addresses the posed research questions, and highlights\nthe remaining research gaps and challenges.\n","authors":["Nadya Abdel Madjid","Abdulrahman Ahmad","Murad Mebrahtu","Yousef Babaa","Abdelmoamen Nasser","Sumbal Malik","Bilal Hassan","Naoufel Werghi","Jorge Dias","Majid Khonji"],"pdf_url":"https://arxiv.org/pdf/2503.03262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05274v2","updated":"2025-03-05T08:36:27Z","published":"2024-09-17T10:08:37Z","title":"Scale-Invariant Object Detection by Adaptive Convolution with Unified\n Global-Local Context","summary":" Dense features are important for detecting minute objects in images.\nUnfortunately, despite the remarkable efficacy of the CNN models in multi-scale\nobject detection, CNN models often fail to detect smaller objects in images due\nto the loss of dense features during the pooling process. Atrous convolution\naddresses this issue by applying sparse kernels. However, sparse kernels often\ncan lose the multi-scale detection efficacy of the CNN model. In this paper, we\npropose an object detection model using a Switchable (adaptive) Atrous\nConvolutional Network (SAC-Net) based on the efficientDet model. A fixed atrous\nrate limits the performance of the CNN models in the convolutional layers. To\novercome this limitation, we introduce a switchable mechanism that allows for\ndynamically adjusting the atrous rate during the forward pass. The proposed\nSAC-Net encapsulates the benefits of both low-level and high-level features to\nachieve improved performance on multi-scale object detection tasks, without\nlosing the dense features. Further, we apply a depth-wise switchable atrous\nrate to the proposed network, to improve the scale-invariant features. Finally,\nwe apply global context on the proposed model. 
Our extensive experiments on\nbenchmark datasets demonstrate that the proposed SAC-Net outperforms the\nstate-of-the-art models by a significant margin in terms of accuracy.\n","authors":["Amrita Singh","Snehasis Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2410.05274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06927v3","updated":"2025-03-05T08:35:41Z","published":"2024-08-13T14:29:00Z","title":"Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class\n Feature Compensator","summary":" Dataset distillation has emerged as a technique aiming to condense\ninformative features from large, natural datasets into a compact and synthetic\nform. While recent advancements have refined this technique, its performance is\nbottlenecked by the prevailing class-specific synthesis paradigm. Under this\nparadigm, synthetic data is optimized exclusively for a pre-assigned one-hot\nlabel, creating an implicit class barrier in feature condensation. This leads\nto inefficient utilization of the distillation budget and oversight of\ninter-class feature distributions, which ultimately limits the effectiveness\nand efficiency, as demonstrated in our analysis. To overcome these constraints,\nthis paper presents the Inter-class Feature Compensator (INFER), an innovative\ndistillation approach that transcends the class-specific data-label framework\nwidely utilized in current dataset distillation methods. Specifically, INFER\nleverages a Universal Feature Compensator (UFC) to enhance feature integration\nacross classes, enabling the generation of multiple additional synthetic\ninstances from a single UFC input. This significantly improves the efficiency\nof the distillation budget. Moreover, INFER enriches inter-class interactions\nduring the distillation, thereby enhancing the effectiveness and\ngeneralizability of the distilled data. 
By allowing for the linear\ninterpolation of labels similar to those in the original dataset, INFER\nmeticulously optimizes the synthetic data and dramatically reduces the size of\nsoft labels in the synthetic dataset to almost zero, establishing a new\nbenchmark for efficiency and effectiveness in dataset distillation. In\npractice, INFER demonstrates state-of-the-art performance across benchmark\ndatasets. For instance, in the ipc = 50 setting on ImageNet-1k with the same\ncompression level, it outperforms SRe2L by 34.5% using ResNet18.\n","authors":["Xin Zhang","Jiawei Du","Ping Liu","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.06927v3.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2503.03259v1","updated":"2025-03-05T08:33:08Z","published":"2025-03-05T08:33:08Z","title":"BANet: Bilateral Aggregation Network for Mobile Stereo Matching","summary":" State-of-the-art stereo matching methods typically use costly 3D convolutions\nto aggregate a full cost volume, but their computational demands make mobile\ndeployment challenging. Directly applying 2D convolutions for cost aggregation\noften results in edge blurring, detail loss, and mismatches in textureless\nregions. Some complex operations, like deformable convolutions and iterative\nwarping, can partially alleviate this issue; however, they are not\nmobile-friendly, limiting their deployment on mobile devices. In this paper, we\npresent a novel bilateral aggregation network (BANet) for mobile stereo\nmatching that produces high-quality results with sharp edges and fine details\nusing only 2D convolutions. Specifically, we first separate the full cost\nvolume into detailed and smooth volumes using a spatial attention map, then\nperform detailed and smooth aggregations accordingly, ultimately fusing both to\nobtain the final disparity map. 
Additionally, to accurately identify\nhigh-frequency detailed regions and low-frequency smooth/textureless regions,\nwe propose a new scale-aware spatial attention module. Experimental results\ndemonstrate that our BANet-2D significantly outperforms other mobile-friendly\nmethods, achieving 35.3\\% higher accuracy on the KITTI 2015 leaderboard than\nMobileStereoNet-2D, with faster runtime on mobile devices. The extended 3D\nversion, BANet-3D, achieves the highest accuracy among all real-time methods on\nhigh-end GPUs. Code: \\textcolor{magenta}{https://github.com/gangweiX/BANet}.\n","authors":["Gangwei Xu","Jiaxin Liu","Xianqi Wang","Junda Cheng","Yong Deng","Jinliang Zang","Yurui Chen","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2503.03259v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2503.03256v1","updated":"2025-03-05T08:20:16Z","published":"2025-03-05T08:20:16Z","title":"BAT: Learning Event-based Optical Flow with Bidirectional Adaptive\n Temporal Correlation","summary":" Event cameras deliver visual information characterized by a high dynamic\nrange and high temporal resolution, offering significant advantages in\nestimating optical flow for complex lighting conditions and fast-moving\nobjects. Current advanced optical flow methods for event cameras largely adopt\nestablished image-based frameworks. However, the spatial sparsity of event data\nlimits their performance. In this paper, we present BAT, an innovative\nframework that estimates event-based optical flow using bidirectional adaptive\ntemporal correlation. 
BAT includes three novel designs: 1) a bidirectional\ntemporal correlation that transforms bidirectional temporally dense motion cues\ninto spatially dense ones, enabling accurate and spatially dense optical flow\nestimation; 2) an adaptive temporal sampling strategy for maintaining temporal\nconsistency in correlation; 3) spatially adaptive temporal motion aggregation\nto efficiently and adaptively aggregate consistent target motion features into\nadjacent motion features while suppressing inconsistent ones. Our results rank\n$1^{st}$ on the DSEC-Flow benchmark, outperforming existing state-of-the-art\nmethods by a large margin while also exhibiting sharp edges and high-quality\ndetails. Notably, our BAT can accurately predict future optical flow using only\npast events, significantly outperforming E-RAFT's warm-start approach. Code:\n\\textcolor{magenta}{https://github.com/gangweiX/BAT}.\n","authors":["Gangwei Xu","Haotong Lin","Zhaoxing Zhang","Hongcheng Luo","Haiyang Sun","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2503.03256v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2503.03255v1","updated":"2025-03-05T08:15:31Z","published":"2025-03-05T08:15:31Z","title":"Computational Analysis of Degradation Modeling in Blind Panoramic Image\n Quality Assessment","summary":" Blind panoramic image quality assessment (BPIQA) has recently brought new\nchallenge to the visual quality community, due to the complex interaction\nbetween immersive content and human behavior. Although many efforts have been\nmade to advance BPIQA from both conducting psychophysical experiments and\ndesigning performance-driven objective algorithms, \\textit{limited content} and\n\\textit{few samples} in those closed sets inevitably would result in shaky\nconclusions, thereby hindering the development of BPIQA, we refer to it as the\n\\textit{easy-database} issue. 
In this paper, we present a sufficient\ncomputational analysis of degradation modeling in BPIQA to thoroughly explore\nthe \\textit{easy-database issue}, where we carefully design three types of\nexperiments via investigating the gap between BPIQA and blind image quality\nassessment (BIQA), the necessity of specific design in BPIQA models, and the\ngeneralization ability of BPIQA models. From extensive experiments, we find\nthat easy databases narrow the gap between the performance of BPIQA and BIQA\nmodels, which is unconducive to the development of BPIQA. And the easy\ndatabases make the BPIQA models be closed to saturation, therefore the\neffectiveness of the associated specific designs can not be well verified.\nBesides, the BPIQA models trained on our recently proposed databases with\ncomplicated degradation show better generalization ability. Thus, we believe\nthat much more efforts are highly desired to put into BPIQA from both\nsubjective viewpoint and objective viewpoint.\n","authors":["Jiebin Yan","Ziwen Tan","Jiale Rao","Lei Wu","Yifan Zuo","Yuming Fang"],"pdf_url":"https://arxiv.org/pdf/2503.03255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.00397v3","updated":"2025-03-05T08:09:16Z","published":"2025-03-01T08:18:11Z","title":"Floorplan-SLAM: A Real-Time, High-Accuracy, and Long-Term Multi-Session\n Point-Plane SLAM for Efficient Floorplan Reconstruction","summary":" Floorplan reconstruction provides structural priors essential for reliable\nindoor robot navigation and high-level scene understanding. However, existing\napproaches either require time-consuming offline processing with a complete\nmap, or rely on expensive sensors and substantial computational resources. 
To\naddress the problems, we propose Floorplan-SLAM, which incorporates floorplan\nreconstruction tightly into a multi-session SLAM system by seamlessly\ninteracting with plane extraction, pose estimation, and back-end optimization,\nachieving real-time, high-accuracy, and long-term floorplan reconstruction\nusing only a stereo camera. Specifically, we present a robust plane extraction\nalgorithm that operates in a compact plane parameter space and leverages\nspatially complementary features to accurately detect planar structures, even\nin weakly textured scenes. Furthermore, we propose a floorplan reconstruction\nmodule tightly coupled with the SLAM system, which uses continuously optimized\nplane landmarks and poses to formulate and solve a novel optimization problem,\nthereby enabling real-time incremental floorplan reconstruction. Note that by\nleveraging the map merging capability of multi-session SLAM, our method\nsupports long-term floorplan reconstruction across multiple sessions without\nredundant data collection. 
Experiments on the VECtor and the self-collected\ndatasets indicate that Floorplan-SLAM significantly outperforms\nstate-of-the-art methods in terms of plane extraction robustness, pose\nestimation accuracy, and floorplan reconstruction fidelity and speed, achieving\nreal-time performance at 25-45 FPS without GPU acceleration, which reduces the\nfloorplan reconstruction time for a 1000 square meters scene from over 10 hours\nto just 9.44 minutes.\n","authors":["Haolin Wang","Zeren Lv","Hao Wei","Haijiang Zhu","Yihong Wu"],"pdf_url":"https://arxiv.org/pdf/2503.00397v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12020v4","updated":"2025-03-05T08:09:07Z","published":"2024-04-18T09:16:02Z","title":"Look, Listen, and Answer: Overcoming Biases for Audio-Visual Question\n Answering","summary":" Audio-Visual Question Answering (AVQA) is a complex multi-modal reasoning\ntask, demanding intelligent systems to accurately respond to natural language\nqueries based on audio-video input pairs. Nevertheless, prevalent AVQA\napproaches are prone to overlearning dataset biases, resulting in poor\nrobustness. Furthermore, current datasets may not provide a precise diagnostic\nfor these methods. To tackle these challenges, firstly, we propose a novel\ndataset, MUSIC-AVQA-R, crafted in two steps: rephrasing questions within the\ntest split of a public dataset (MUSIC-AVQA) and subsequently introducing\ndistribution shifts to split questions. The former leads to a large, diverse\ntest space, while the latter results in a comprehensive robustness evaluation\non rare, frequent, and overall questions. Secondly, we propose a robust\narchitecture that utilizes a multifaceted cycle collaborative debiasing\nstrategy to overcome bias learning. Experimental results show that this\narchitecture achieves state-of-the-art performance on MUSIC-AVQA-R, notably\nobtaining a significant improvement of 9.32%. 
Extensive ablation experiments\nare conducted on the two datasets mentioned to analyze the component\neffectiveness within the debiasing strategy. Additionally, we highlight the\nlimited robustness of existing multi-modal QA methods through the evaluation on\nour dataset. We also conduct experiments combining various baselines with our\nproposed strategy on two datasets to verify its plug-and-play capability. Our\ndataset and code are available at https://github.com/reml-group/MUSIC-AVQA-R.\n","authors":["Jie Ma","Min Hu","Pinghui Wang","Wangchun Sun","Lingyun Song","Hongbin Pei","Jun Liu","Youtian Du"],"pdf_url":"https://arxiv.org/pdf/2404.12020v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2503.03244v1","updated":"2025-03-05T07:52:52Z","published":"2025-03-05T07:52:52Z","title":"Two-Stream Thermal Imaging Fusion for Enhanced Time of Birth Detection\n in Neonatal Care","summary":" Around 10% of newborns require some help to initiate breathing, and 5\\% need\nventilation assistance. Accurate Time of Birth (ToB) documentation is essential\nfor optimizing neonatal care, as timely interventions are vital for proper\nresuscitation. However, current clinical methods for recording ToB often rely\non manual processes, which can be prone to inaccuracies. In this study, we\npresent a novel two-stream fusion system that combines the power of image and\nvideo analysis to accurately detect the ToB from thermal recordings in the\ndelivery room and operating theater. By integrating static and dynamic streams,\nour approach captures richer birth-related spatiotemporal features, leading to\nmore robust and precise ToB estimation. We demonstrate that this synergy\nbetween data modalities enhances performance over single-stream approaches. Our\nsystem achieves 95.7% precision and 84.8% recall in detecting birth within\nshort video clips. 
Additionally, with the help of a score aggregation module,\nit successfully identifies ToB in 100% of test cases, with a median absolute\nerror of 2 seconds and an absolute mean deviation of 4.5 seconds compared to\nmanual annotations.\n","authors":["Jorge García-Torres","Øyvind Meinich-Bache","Sara Brunner","Siren Rettedal","Vilde Kolstad","Kjersti Engan"],"pdf_url":"https://arxiv.org/pdf/2503.03244v1.pdf","comment":"Submitted to IEEE 25th International Conference on Digital Signal\n Processing"},{"id":"http://arxiv.org/abs/2503.02357v2","updated":"2025-03-05T07:50:05Z","published":"2025-03-04T07:28:45Z","title":"Q-Eval-100K: Evaluating Visual Quality and Alignment Level for\n Text-to-Vision Content","summary":" Evaluating text-to-vision content hinges on two crucial aspects: visual\nquality and alignment. While significant progress has been made in developing\nobjective models to assess these dimensions, the performance of such models\nheavily relies on the scale and quality of human annotations. According to\nScaling Law, increasing the number of human-labeled instances follows a\npredictable pattern that enhances the performance of evaluation models.\nTherefore, we introduce a comprehensive dataset designed to Evaluate Visual\nquality and Alignment Level for text-to-vision content (Q-EVAL-100K), featuring\nthe largest collection of human-labeled Mean Opinion Scores (MOS) for the\nmentioned two aspects. The Q-EVAL-100K dataset encompasses both text-to-image\nand text-to-video models, with 960K human annotations specifically focused on\nvisual quality and alignment for 100K instances (60K images and 40K videos).\nLeveraging this dataset with context prompt, we propose Q-Eval-Score, a unified\nmodel capable of evaluating both visual quality and alignment with special\nimprovements for handling long-text prompt alignment. 
Experimental results\nindicate that the proposed Q-Eval-Score achieves superior performance on both\nvisual quality and alignment, with strong generalization capabilities across\nother benchmarks. These findings highlight the significant value of the\nQ-EVAL-100K dataset. Data and codes will be available at\nhttps://github.com/zzc-1998/Q-Eval.\n","authors":["Zicheng Zhang","Tengchuan Kou","Shushi Wang","Chunyi Li","Wei Sun","Wei Wang","Xiaoyu Li","Zongyu Wang","Xuezhi Cao","Xiongkuo Min","Xiaohong Liu","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2503.02357v2.pdf","comment":"Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03236v1","updated":"2025-03-05T07:29:12Z","published":"2025-03-05T07:29:12Z","title":"GenColor: Generative Color-Concept Association in Visual Design","summary":" Existing approaches for color-concept association typically rely on\nquery-based image referencing, and color extraction from image references.\nHowever, these approaches are effective only for common concepts, and are\nvulnerable to unstable image referencing and varying image conditions. Our\nformative study with designers underscores the need for primary-accent color\ncompositions and context-dependent colors (e.g., 'clear' vs. 'polluted' sky) in\ndesign. In response, we introduce a generative approach for mining semantically\nresonant colors leveraging images generated by text-to-image models. Our\ninsight is that contemporary text-to-image models can resemble visual patterns\nfrom large-scale real-world data. The framework comprises three stages: concept\ninstancing produces generative samples using diffusion models, text-guided\nimage segmentation identifies concept-relevant regions within the image, and\ncolor association extracts primarily accompanied by accent colors. 
Quantitative\ncomparisons with expert designs validate our approach's effectiveness, and we\ndemonstrate the applicability through cases in various design scenarios and a\ngallery.\n","authors":["Yihan Hou","Xingchen Zeng","Yusong Wang","Manling Yang","Xiaojiao Chen","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2503.03236v1.pdf","comment":"19 pages, 16 figures. Accepted at CHI Conference on Human Factors in\n Computing Systems (CHI'25), April 26-May 1, 2025, Yokohama, Japan"},{"id":"http://arxiv.org/abs/2411.13807v3","updated":"2025-03-05T07:24:34Z","published":"2024-11-21T03:13:30Z","title":"MagicDrive-V2: High-Resolution Long Video Generation for Autonomous\n Driving with Adaptive Control","summary":" The rapid advancement of diffusion models has greatly improved video\nsynthesis, especially in controllable video generation, which is vital for\napplications like autonomous driving. Although DiT with 3D VAE has become a\nstandard framework for video generation, it introduces challenges in\ncontrollable driving video generation, especially for geometry control,\nrendering existing control methods ineffective. To address these issues, we\npropose MagicDrive-V2, a novel approach that integrates the MVDiT block and\nspatial-temporal conditional encoding to enable multi-view video generation and\nprecise geometric control. Additionally, we introduce an efficient method for\nobtaining contextual descriptions for videos to support diverse textual\ncontrol, along with a progressive training strategy using mixed video data to\nenhance training efficiency and generalizability. Consequently, MagicDrive-V2\nenables multi-view driving video synthesis with $3.3\\times$ resolution and\n$4\\times$ frame count (compared to current SOTA), rich contextual control, and\ngeometric controls. 
Extensive experiments demonstrate MagicDrive-V2's ability,\nunlocking broader applications in autonomous driving.\n","authors":["Ruiyuan Gao","Kai Chen","Bo Xiao","Lanqing Hong","Zhenguo Li","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2411.13807v3.pdf","comment":"Project Website: https://flymin.github.io/magicdrive-v2/"},{"id":"http://arxiv.org/abs/2412.09601v2","updated":"2025-03-05T07:06:15Z","published":"2024-12-12T18:59:11Z","title":"TimeRefine: Temporal Grounding with Time Refining Video LLM","summary":" Video temporal grounding aims to localize relevant temporal boundaries in a\nvideo given a textual prompt. Recent work has focused on enabling Video LLMs to\nperform video temporal grounding via next-token prediction of temporal\ntimestamps. However, accurately localizing timestamps in videos remains\nchallenging for Video LLMs when relying solely on temporal token prediction.\nOur proposed TimeRefine addresses this challenge in two ways. First, instead of\ndirectly predicting the start and end timestamps, we reformulate the temporal\ngrounding task as a temporal refining task: the model first makes rough\npredictions and then refines them by predicting offsets to the target segment.\nThis refining process is repeated multiple times, through which the model\nprogressively self-improves its temporal localization accuracy. Second, to\nenhance the model's temporal perception capabilities, we incorporate an\nauxiliary prediction head that penalizes the model more if a predicted segment\ndeviates further from the ground truth, thus encouraging the model to make\ncloser and more accurate predictions. Our plug-and-play method can be\nintegrated into most LLM-based temporal grounding approaches. The experimental\nresults demonstrate that TimeRefine achieves 3.6% and 5.0% mIoU improvements on\nthe ActivityNet and Charades-STA datasets, respectively. 
Code and pretrained\nmodels will be released.\n","authors":["Xizi Wang","Feng Cheng","Ziyang Wang","Huiyu Wang","Md Mohaiminul Islam","Lorenzo Torresani","Mohit Bansal","Gedas Bertasius","David Crandall"],"pdf_url":"https://arxiv.org/pdf/2412.09601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03228v1","updated":"2025-03-05T06:56:42Z","published":"2025-03-05T06:56:42Z","title":"Path-Adaptive Matting for Efficient Inference Under Various\n Computational Cost Constraints","summary":" In this paper, we explore a novel image matting task aimed at achieving\nefficient inference under various computational cost constraints, specifically\nFLOP limitations, using a single matting network. Existing matting methods\nwhich have not explored scalable architectures or path-learning strategies,\nfail to tackle this challenge. To overcome these limitations, we introduce\nPath-Adaptive Matting (PAM), a framework that dynamically adjusts network paths\nbased on image contexts and computational cost constraints. We formulate the\ntraining of the computational cost-constrained matting network as a bilevel\noptimization problem, jointly optimizing the matting network and the path\nestimator. Building on this formalization, we design a path-adaptive matting\narchitecture by incorporating path selection layers and learnable connect\nlayers to estimate optimal paths and perform efficient inference within a\nunified network. Furthermore, we propose a performance-aware path-learning\nstrategy to generate path labels online by evaluating a few paths sampled from\nthe prior distribution of optimal paths and network estimations, enabling\nrobust and efficient online path learning. 
Experiments on five image matting\ndatasets demonstrate that the proposed PAM framework achieves competitive\nperformance across a range of computational cost constraints.\n","authors":["Qinglin Liu","Zonglin Li","Xiaoqian Lv","Xin Sun","Ru Li","Shengping Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03228v1.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2502.19908v2","updated":"2025-03-05T06:36:27Z","published":"2025-02-27T09:26:22Z","title":"CarPlanner: Consistent Auto-regressive Trajectory Planning for\n Large-scale Reinforcement Learning in Autonomous Driving","summary":" Trajectory planning is vital for autonomous driving, ensuring safe and\nefficient navigation in complex environments. While recent learning-based\nmethods, particularly reinforcement learning (RL), have shown promise in\nspecific scenarios, RL planners struggle with training inefficiencies and\nmanaging large-scale, real-world driving scenarios. In this paper, we introduce\n\\textbf{CarPlanner}, a \\textbf{C}onsistent \\textbf{a}uto-\\textbf{r}egressive\n\\textbf{Planner} that uses RL to generate multi-modal trajectories. The\nauto-regressive structure enables efficient large-scale RL training, while the\nincorporation of consistency ensures stable policy learning by maintaining\ncoherent temporal consistency across time steps. Moreover, CarPlanner employs a\ngeneration-selection framework with an expert-guided reward function and an\ninvariant-view module, simplifying RL training and enhancing policy\nperformance. Extensive analysis demonstrates that our proposed RL framework\neffectively addresses the challenges of training efficiency and performance\nenhancement, positioning CarPlanner as a promising solution for trajectory\nplanning in autonomous driving. To the best of our knowledge, we are the first\nto demonstrate that the RL-based planner can surpass both IL- and rule-based\nstate-of-the-arts (SOTAs) on the challenging large-scale real-world dataset\nnuPlan. 
Our proposed CarPlanner surpasses RL-, IL-, and rule-based SOTA\napproaches within this demanding dataset.\n","authors":["Dongkun Zhang","Jiaming Liang","Ke Guo","Sha Lu","Qi Wang","Rong Xiong","Zhenwei Miao","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2502.19908v2.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03222v1","updated":"2025-03-05T06:32:49Z","published":"2025-03-05T06:32:49Z","title":"Mocap-2-to-3: Lifting 2D Diffusion-Based Pretrained Models for 3D Motion\n Capture","summary":" Recovering absolute poses in the world coordinate system from monocular views\npresents significant challenges. Two primary issues arise in this context.\nFirstly, existing methods rely on 3D motion data for training, which requires\ncollection in limited environments. Acquiring such 3D labels for new actions in\na timely manner is impractical, severely restricting the model's generalization\ncapabilities. In contrast, 2D poses are far more accessible and easier to\nobtain. Secondly, estimating a person's absolute position in metric space from\na single viewpoint is inherently more complex. To address these challenges, we\nintroduce Mocap-2-to-3, a novel framework that decomposes intricate 3D motions\ninto 2D poses, leveraging 2D data to enhance 3D motion reconstruction in\ndiverse scenarios and accurately predict absolute positions in the world\ncoordinate system. We initially pretrain a single-view diffusion model with\nextensive 2D data, followed by fine-tuning a multi-view diffusion model for\nview consistency using publicly available 3D data. This strategy facilitates\nthe effective use of large-scale 2D data. Additionally, we propose an\ninnovative human motion representation that decouples local actions from global\nmovements and encodes geometric priors of the ground, ensuring the generative\nmodel learns accurate motion priors from 2D data. 
During inference, this allows\nfor the gradual recovery of global movements, resulting in more plausible\npositioning. We evaluate our model's performance on real-world datasets,\ndemonstrating superior accuracy in motion and absolute human positioning\ncompared to state-of-the-art methods, along with enhanced generalization and\nscalability. Our code will be made publicly available.\n","authors":["Zhumei Wang","Zechen Hu","Ruoxi Guo","Huaijin Pi","Ziyong Feng","Sida Peng","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2503.03222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03214v1","updated":"2025-03-05T06:16:13Z","published":"2025-03-05T06:16:13Z","title":"Rice Grain Size Measurement using Image Processing","summary":" The rice grain quality can be determined from its size and chalkiness. The\ntraditional approach to measure the rice grain size involves manual inspection,\nwhich is inefficient and leads to inconsistent results. To address this issue,\nan image processing based approach is proposed and developed in this research.\nThe approach takes image of rice grains as input and outputs the number of rice\ngrains and size of each rice grain. The different steps, such as extraction of\nregion of interest, segmentation of rice grains, and sub-contours removal,\ninvolved in the proposed approach are discussed. The approach was tested on\nrice grain images captured from different height using mobile phone camera. The\nobtained results show that the proposed approach successfully detected 95\\% of\nthe rice grains and achieved 90\\% accuracy for length and width measurement.\n","authors":["Ankush Tyagi","Dhruv Motwani","Vipul K. Dabhi","Harshadkumar B. 
Prajapati"],"pdf_url":"https://arxiv.org/pdf/2503.03214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03206v1","updated":"2025-03-05T05:50:38Z","published":"2025-03-05T05:50:38Z","title":"An Analytical Theory of Power Law Spectral Bias in the Learning Dynamics\n of Diffusion Models","summary":" We developed an analytical framework for understanding how the learned\ndistribution evolves during diffusion model training. Leveraging the Gaussian\nequivalence principle, we derived exact solutions for the gradient-flow\ndynamics of weights in one- or two-layer linear denoiser settings with\narbitrary data. Remarkably, these solutions allowed us to derive the generated\ndistribution in closed form and its KL divergence through training. These\nanalytical results expose a pronounced power-law spectral bias, i.e., for\nweights and distributions, the convergence time of a mode follows an inverse\npower law of its variance. Empirical experiments on both Gaussian and image\ndatasets demonstrate that the power-law spectral bias remains robust even when\nusing deeper or convolutional architectures. Our results underscore the\nimportance of the data covariance in dictating the order and rate at which\ndiffusion models learn different modes of the data, providing potential\nexplanations for why earlier stopping could lead to incorrect details in image\ngenerative models.\n","authors":["Binxu Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03206v1.pdf","comment":"50 pages, 10 figures. Preprint"},{"id":"http://arxiv.org/abs/2503.03204v1","updated":"2025-03-05T05:50:28Z","published":"2025-03-05T05:50:28Z","title":"Find Matching Faces Based On Face Parameters","summary":" This paper presents an innovative approach that enables the user to find\nmatching faces based on the user-selected face parameters. Through gradio-based\nuser interface, the users can interactively select the face parameters they\nwant in their desired partner. 
These user-selected face parameters are\ntransformed into a text prompt which is used by the Text-To-Image generation\nmodel to generate a realistic face image. Further, the generated image along\nwith the images downloaded from the Jeevansathi.com are processed through face\ndetection and feature extraction model, which results in high dimensional\nvector embedding of 512 dimensions. The vector embeddings generated from the\ndownloaded images are stored into vector database. Now, the similarity search\nis carried out between the vector embedding of generated image and the stored\nvector embeddings. As a result, it displays the top five similar faces based on\nthe user-selected face parameters. This contribution holds a significant\npotential to turn into a high-quality personalized face matching tool.\n","authors":["Setu A. Bhatt","Harshadkumar B. Prajapati","Vipul K. Dabhi","Ankush Tyagi"],"pdf_url":"https://arxiv.org/pdf/2503.03204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03202v1","updated":"2025-03-05T05:46:08Z","published":"2025-03-05T05:46:08Z","title":"Variance-Aware Loss Scheduling for Multimodal Alignment in Low-Data\n Settings","summary":" Training vision-language models for image-text alignment typically requires\nlarge datasets to achieve robust performance. In low-data scenarios, standard\ncontrastive learning can struggle to align modalities effectively due to\noverfitting and unstable training dynamics. In this paper, we propose a\nvariance-aware loss scheduling approach that dynamically adjusts the weighting\nof the contrastive loss based on the statistical variability (uncertainty) in\nthe model's alignment predictions. Using a subset of the Flickr8k image-caption\ndataset to simulate limited data conditions, we demonstrate that our approach\nimproves image-text retrieval accuracy compared to a fixed-weight baseline. 
We\nalso compare against other adaptive weighting strategies (using output entropy\nand cosine similarity spread) and find that variance-aware scheduling provides\nthe best overall trade-off. Qualitatively, our method yields more distinct\nmultimodal embeddings as shown by t-SNE visualizations. Moreover, in a stress\ntest with noise-injected captions and images, the variance-guided loss proves\nmore robust, maintaining higher recall when random perturbations are\nintroduced. These results highlight the benefit of adaptive loss weighting for\nmultimodal alignment in low-data regimes.\n","authors":["Sneh Pillai"],"pdf_url":"https://arxiv.org/pdf/2503.03202v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2503.03200v1","updated":"2025-03-05T05:36:26Z","published":"2025-03-05T05:36:26Z","title":"Transformer-Based Spatio-Temporal Association of Apple Fruitlets","summary":" In this paper, we present a transformer-based method to spatio-temporally\nassociate apple fruitlets in stereo-images collected on different days and from\ndifferent camera poses. State-of-the-art association methods in agriculture are\ndedicated towards matching larger crops using either high-resolution point\nclouds or temporally stable features, which are both difficult to obtain for\nsmaller fruit in the field. To address these challenges, we propose a\ntransformer-based architecture that encodes the shape and position of each\nfruitlet, and propagates and refines these features through a series of\ntransformer encoder layers with alternating self and cross-attention. 
We\ndemonstrate that our method is able to achieve an F1-score of 92.4% on data\ncollected in a commercial apple orchard and outperforms all baselines and\nablations.\n","authors":["Harry Freeman","George Kantor"],"pdf_url":"https://arxiv.org/pdf/2503.03200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03196v1","updated":"2025-03-05T05:30:22Z","published":"2025-03-05T05:30:22Z","title":"SpiritSight Agent: Advanced GUI Agent with One Look","summary":" Graphical User Interface (GUI) agents show amazing abilities in assisting\nhuman-computer interaction, automating human user's navigation on digital\ndevices. An ideal GUI agent is expected to achieve high accuracy, low latency,\nand compatibility for different GUI platforms. Recent vision-based approaches\nhave shown promise by leveraging advanced Vision Language Models (VLMs). While\nthey generally meet the requirements of compatibility and low latency, these\nvision-based GUI agents tend to have low accuracy due to their limitations in\nelement grounding. To address this issue, we propose $\\textbf{SpiritSight}$, a\nvision-based, end-to-end GUI agent that excels in GUI navigation tasks across\nvarious GUI platforms. First, we create a multi-level, large-scale,\nhigh-quality GUI dataset called $\\textbf{GUI-Lasagne}$ using scalable methods,\nempowering SpiritSight with robust GUI understanding and grounding\ncapabilities. Second, we introduce the $\\textbf{Universal Block Parsing (UBP)}$\nmethod to resolve the ambiguity problem in dynamic high-resolution of visual\ninputs, further enhancing SpiritSight's ability to ground GUI objects. Through\nthese efforts, SpiritSight agent outperforms other advanced methods on diverse\nGUI benchmarks, demonstrating its superior capability and compatibility in GUI\nnavigation tasks. 
Models are available at\n$\\href{https://huggingface.co/SenseLLM/SpiritSight-Agent-8B}{this\\ URL}$.\n","authors":["Zhiyuan Huang","Ziming Cheng","Junting Pan","Zhaohui Hou","Mingjie Zhan"],"pdf_url":"https://arxiv.org/pdf/2503.03196v1.pdf","comment":"Paper accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2312.10892v3","updated":"2025-03-05T05:27:43Z","published":"2023-12-18T02:50:45Z","title":"Deep Learning-based MRI Reconstruction with Artificial Fourier Transform\n Network (AFTNet)","summary":" Deep complex-valued neural networks (CVNNs) provide a powerful way to\nleverage complex number operations and representations and have succeeded in\nseveral phase-based applications. However, previous networks have not fully\nexplored the impact of complex-valued networks in the frequency domain. Here,\nwe introduce a unified complex-valued deep learning framework-Artificial\nFourier Transform Network (AFTNet)-which combines domain-manifold learning and\nCVNNs. AFTNet can be readily used to solve image inverse problems in domain\ntransformation, especially for accelerated magnetic resonance imaging (MRI)\nreconstruction and other applications. While conventional methods typically\nutilize magnitude images or treat the real and imaginary components of k-space\ndata as separate channels, our approach directly processes raw k-space data in\nthe frequency domain, utilizing complex-valued operations. This allows for a\nmapping between the frequency (k-space) and image domain to be determined\nthrough cross-domain learning. We show that AFTNet achieves superior\naccelerated MRI reconstruction compared to existing approaches. Furthermore,\nour approach can be applied to various tasks, such as denoised magnetic\nresonance spectroscopy (MRS) reconstruction and datasets with various\ncontrasts. 
The AFTNet presented here is a valuable preprocessing component for\ndifferent preclinical studies and provides an innovative alternative for\nsolving inverse problems in imaging and spectroscopy. The code is available at:\nhttps://github.com/yanting-yang/AFT-Net.\n","authors":["Yanting Yang","Yiren Zhang","Zongyu Li","Jeffery Siyuan Tian","Matthieu Dagommer","Jia Guo"],"pdf_url":"https://arxiv.org/pdf/2312.10892v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15503v5","updated":"2025-03-05T05:14:34Z","published":"2024-08-28T03:17:40Z","title":"RoboSense: Large-scale Dataset and Benchmark for Egocentric Robot\n Perception and Navigation in Crowded and Unstructured Environments","summary":" Reliable embodied perception from an egocentric perspective is challenging\nyet essential for autonomous navigation technology of intelligent mobile\nagents. With the growing demand of social robotics, near-field scene\nunderstanding becomes an important research topic in the areas of egocentric\nperceptual tasks related to navigation in both crowded and unstructured\nenvironments. Due to the complexity of environmental conditions and difficulty\nof surrounding obstacles owing to truncation and occlusion, the perception\ncapability under this circumstance is still inferior. To further enhance the\nintelligence of mobile robots, in this paper, we setup an egocentric\nmulti-sensor data collection platform based on 3 main types of sensors (Camera,\nLiDAR and Fisheye), which supports flexible sensor configurations to enable\ndynamic sight of view from ego-perspective, capturing either near or farther\nareas. Meanwhile, a large-scale multimodal dataset is constructed, named\nRoboSense, to facilitate egocentric robot perception. Specifically, RoboSense\ncontains more than 133K synchronized data with 1.4M 3D bounding box and IDs\nannotated in the full $360^{\\circ}$ view, forming 216K trajectories across 7.6K\ntemporal sequences. 
It has $270\\times$ and $18\\times$ as many annotations of\nsurrounding obstacles within near ranges as the previous datasets collected for\nautonomous driving scenarios such as KITTI and nuScenes. Moreover, we define a\nnovel matching criterion for near-field 3D perception and prediction metrics.\nBased on RoboSense, we formulate 6 popular tasks to facilitate the future\nresearch development, where the detailed analysis as well as benchmarks are\nalso provided accordingly. Data desensitization measures have been conducted\nfor privacy protection.\n","authors":["Haisheng Su","Feixiang Song","Cong Ma","Wei Wu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2408.15503v5.pdf","comment":"Accepted to CVPR2025"},{"id":"http://arxiv.org/abs/2503.03190v1","updated":"2025-03-05T05:13:53Z","published":"2025-03-05T05:13:53Z","title":"DSPNet: Dual-vision Scene Perception for Robust 3D Question Answering","summary":" 3D Question Answering (3D QA) requires the model to comprehensively\nunderstand its situated 3D scene described by the text, then reason about its\nsurrounding environment and answer a question under that situation. However,\nexisting methods usually rely on global scene perception from pure 3D point\nclouds and overlook the importance of rich local texture details from\nmulti-view images. Moreover, due to the inherent noise in camera poses and\ncomplex occlusions, there exists significant feature degradation and reduced\nfeature robustness problems when aligning 3D point cloud with multi-view\nimages. In this paper, we propose a Dual-vision Scene Perception Network\n(DSPNet), to comprehensively integrate multi-view and point cloud features to\nimprove robustness in 3D QA. Our Text-guided Multi-view Fusion (TGMF) module\nprioritizes image views that closely match the semantic content of the text. 
To\nadaptively fuse back-projected multi-view images with point cloud features, we\ndesign the Adaptive Dual-vision Perception (ADVP) module, enhancing 3D scene\ncomprehension. Additionally, our Multimodal Context-guided Reasoning (MCGR)\nmodule facilitates robust reasoning by integrating contextual information\nacross visual and linguistic modalities. Experimental results on SQA3D and\nScanQA datasets demonstrate the superiority of our DSPNet. Codes will be\navailable at https://github.com/LZ-CH/DSPNet.\n","authors":["Jingzhou Luo","Yang Liu","Weixing Chen","Zhen Li","Yaowei Wang","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2503.03190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02745v2","updated":"2025-03-05T04:49:18Z","published":"2025-03-04T16:10:42Z","title":"ArcPro: Architectural Programs for Structured 3D Abstraction of Sparse\n Points","summary":" We introduce ArcPro, a novel learning framework built on architectural\nprograms to recover structured 3D abstractions from highly sparse and\nlow-quality point clouds. Specifically, we design a domain-specific language\n(DSL) to hierarchically represent building structures as a program, which can\nbe efficiently converted into a mesh. We bridge feedforward and inverse\nprocedural modeling by using a feedforward process for training data synthesis,\nallowing the network to make reverse predictions. We train an encoder-decoder\non the points-program pairs to establish a mapping from unstructured point\nclouds to architectural programs, where a 3D convolutional encoder extracts\npoint cloud features and a transformer decoder autoregressively predicts the\nprograms in a tokenized form. Inference by our method is highly efficient and\nproduces plausible and faithful 3D abstractions. Comprehensive experiments\ndemonstrate that ArcPro outperforms both traditional architectural proxy\nreconstruction and learning-based abstraction methods. 
We further explore its\npotential to work with multi-view image and natural language inputs.\n","authors":["Qirui Huang","Runze Zhang","Kangjun Liu","Minglun Gong","Hao Zhang","Hui Huang"],"pdf_url":"https://arxiv.org/pdf/2503.02745v2.pdf","comment":"CVPR 2025 (Patent Protected); Project page:\n https://vcc.tech/research/2025/ArcPro"},{"id":"http://arxiv.org/abs/2410.03030v2","updated":"2025-03-05T04:37:07Z","published":"2024-10-03T22:24:54Z","title":"Dynamic Sparse Training versus Dense Training: The Unexpected Winner in\n Image Corruption Robustness","summary":" It is generally perceived that Dynamic Sparse Training opens the door to a\nnew era of scalability and efficiency for artificial neural networks at,\nperhaps, some costs in accuracy performance for the classification task. At the\nsame time, Dense Training is widely accepted as being the \"de facto\" approach\nto train artificial neural networks if one would like to maximize their\nrobustness against image corruption. In this paper, we question this general\npractice. Consequently, we claim that, contrary to what is commonly thought,\nthe Dynamic Sparse Training methods can consistently outperform Dense Training\nin terms of robustness accuracy, particularly if the efficiency aspect is not\nconsidered as a main objective (i.e., sparsity levels between 10% and up to\n50%), without adding (or even reducing) resource cost. We validate our claim on\ntwo types of data, images and videos, using several traditional and modern deep\nlearning architectures for computer vision and three widely studied Dynamic\nSparse Training algorithms. 
Our findings reveal a new yet-unknown benefit of\nDynamic Sparse Training and open new possibilities in improving deep learning\nrobustness beyond the current state of the art.\n","authors":["Boqian Wu","Qiao Xiao","Shunxin Wang","Nicola Strisciuglio","Mykola Pechenizkiy","Maurice van Keulen","Decebal Constantin Mocanu","Elena Mocanu"],"pdf_url":"https://arxiv.org/pdf/2410.03030v2.pdf","comment":"Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.00931v3","updated":"2025-03-05T04:11:08Z","published":"2025-02-02T21:44:15Z","title":"VL-Nav: Real-time Vision-Language Navigation with Spatial Reasoning","summary":" Vision-language navigation in unknown environments is crucial for mobile\nrobots. In scenarios such as household assistance and rescue, mobile robots\nneed to understand a human command, such as \"find a person wearing black\". We\npresent a novel vision-language navigation (VL-Nav) system that integrates\nefficient spatial reasoning on low-power robots. Unlike prior methods that rely\non a single image-level feature similarity to guide a robot, our method\nintegrates pixel-wise vision-language features with curiosity-driven\nexploration. This approach enables robust navigation to human-instructed\ninstances across diverse environments. We deploy VL-Nav on a four-wheel mobile\nrobot and evaluate its performance through comprehensive navigation tasks in\nboth indoor and outdoor environments, spanning different scales and semantic\ncomplexities. Remarkably, VL-Nav operates at a real-time frequency of 30 Hz\nwith a Jetson Orin NX, highlighting its ability to conduct efficient\nvision-language navigation. 
Results show that VL-Nav achieves an overall\nsuccess rate of 86.3%, outperforming previous methods by 44.15%.\n","authors":["Yi Du","Taimeng Fu","Zhuoqun Chen","Bowen Li","Shaoshu Su","Zhipeng Zhao","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2502.00931v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01193v3","updated":"2025-03-05T03:54:00Z","published":"2025-03-03T05:38:57Z","title":"Near-infrared Image Deblurring and Event Denoising with Synergistic\n Neuromorphic Imaging","summary":" The fields of imaging in the nighttime dynamic and other extremely dark\nconditions have seen impressive and transformative advancements in recent\nyears, partly driven by the rise of novel sensing approaches, e.g.,\nnear-infrared (NIR) cameras with high sensitivity and event cameras with\nminimal blur. However, inappropriate exposure ratios of near-infrared cameras\nmake them susceptible to distortion and blur. Event cameras are also highly\nsensitive to weak signals at night yet prone to interference, often generating\nsubstantial noise and significantly degrading observations and analysis.\nHerein, we develop a new framework for low-light imaging combined with NIR\nimaging and event-based techniques, named synergistic neuromorphic imaging,\nwhich can jointly achieve NIR image deblurring and event denoising. Harnessing\ncross-modal features of NIR images and visible events via spectral consistency\nand higher-order interaction, the NIR images and events are simultaneously\nfused, enhanced, and bootstrapped. Experiments on real and realistically\nsimulated sequences demonstrate the effectiveness of our method and indicate\nbetter accuracy and robustness than other methods in practical scenarios. This\nstudy gives impetus to enhance both NIR images and events, which paves the way\nfor high-fidelity low-light imaging and neuromorphic reasoning.\n","authors":["Chao Qu","Shuo Zhu","Yuhang Wang","Zongze Wu","Xiaoyu Chen","Edmund Y. 
Lam","Jing Han"],"pdf_url":"https://arxiv.org/pdf/2503.01193v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03148v1","updated":"2025-03-05T03:42:59Z","published":"2025-03-05T03:42:59Z","title":"Partial Convolution Meets Visual Attention","summary":" Designing an efficient and effective neural network has remained a prominent\ntopic in computer vision research. Depthwise onvolution (DWConv) is widely used\nin efficient CNNs or ViTs, but it needs frequent memory access during\ninference, which leads to low throughput. FasterNet attempts to introduce\npartial convolution (PConv) as an alternative to DWConv but compromises the\naccuracy due to underutilized channels. To remedy this shortcoming and consider\nthe redundancy between feature map channels, we introduce a novel Partial\nvisual ATtention mechanism (PAT) that can efficiently combine PConv with visual\nattention. Our exploration indicates that the partial attention mechanism can\ncompletely replace the full attention mechanism and reduce model parameters and\nFLOPs. Our PAT can derive three types of blocks: Partial Channel-Attention\nblock (PAT_ch), Partial Spatial-Attention block (PAT_sp) and Partial\nSelf-Attention block (PAT_sf). First, PAT_ch integrates the enhanced Gaussian\nchannel attention mechanism to infuse global distribution information into the\nuntouched channels of PConv. Second, we introduce the spatial-wise attention to\nthe MLP layer to further improve model accuracy. Finally, we replace PAT_ch in\nthe last stage with the self-attention mechanism to extend the global receptive\nfield. Building upon PAT, we propose a novel hybrid network family, named\nPATNet, which achieves superior top-1 accuracy and inference speed compared to\nFasterNet on ImageNet-1K classification and excel in both detection and\nsegmentation on the COCO dataset. 
Particularly, our PATNet-T2 achieves 1.3%\nhigher accuracy than FasterNet-T2, while exhibiting 25% higher GPU throughput\nand 24% lower CPU latency.\n","authors":["Haiduo Huang","Fuwei Yang","Dong Li","Ji Liu","Lu Tian","Jinzhang Peng","Pengju Ren","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2503.03148v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2502.01303"},{"id":"http://arxiv.org/abs/2503.02689v2","updated":"2025-03-05T03:41:41Z","published":"2025-03-04T15:02:32Z","title":"STAA-SNN: Spatial-Temporal Attention Aggregator for Spiking Neural\n Networks","summary":" Spiking Neural Networks (SNNs) have gained significant attention due to their\nbiological plausibility and energy efficiency, making them promising\nalternatives to Artificial Neural Networks (ANNs). However, the performance gap\nbetween SNNs and ANNs remains a substantial challenge hindering the widespread\nadoption of SNNs. In this paper, we propose a Spatial-Temporal Attention\nAggregator SNN (STAA-SNN) framework, which dynamically focuses on and captures\nboth spatial and temporal dependencies. First, we introduce a spike-driven\nself-attention mechanism specifically designed for SNNs. Additionally, we\npioneeringly incorporate position encoding to integrate latent temporal\nrelationships into the incoming features. For spatial-temporal information\naggregation, we employ step attention to selectively amplify relevant features\nat different steps. Finally, we implement a time-step random dropout strategy\nto avoid local optima. As a result, STAA-SNN effectively captures both spatial\nand temporal dependencies, enabling the model to analyze complex patterns and\nmake accurate predictions. 
The framework demonstrates exceptional performance\nacross diverse datasets and exhibits strong generalization capabilities.\nNotably, STAA-SNN achieves state-of-the-art results on neuromorphic datasets\nCIFAR10-DVS, with remarkable performances of 97.14%, 82.05% and 70.40% on the\nstatic datasets CIFAR-10, CIFAR-100 and ImageNet, respectively. Furthermore,\nour model exhibits improved performance ranging from 0.33\\% to 2.80\\% with\nfewer time steps. The code for the model is available on GitHub.\n","authors":["Tianqing Zhang","Kairong Yu","Xian Zhong","Hongwei Wang","Qi Xu","Qiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.02689v2.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03144v1","updated":"2025-03-05T03:37:41Z","published":"2025-03-05T03:37:41Z","title":"Temporal Separation with Entropy Regularization for Knowledge\n Distillation in Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs), inspired by the human brain, offer\nsignificant computational efficiency through discrete spike-based information\ntransfer. Despite their potential to reduce inference energy consumption, a\nperformance gap persists between SNNs and Artificial Neural Networks (ANNs),\nprimarily due to current training methods and inherent model limitations. While\nrecent research has aimed to enhance SNN learning by employing knowledge\ndistillation (KD) from ANN teacher networks, traditional distillation\ntechniques often overlook the distinctive spatiotemporal properties of SNNs,\nthus failing to fully leverage their advantages. To overcome these challenge,\nwe propose a novel logit distillation method characterized by temporal\nseparation and entropy regularization. 
This approach improves existing SNN\ndistillation techniques by performing distillation learning on logits across\ndifferent time steps, rather than merely on aggregated output features.\nFurthermore, the integration of entropy regularization stabilizes model\noptimization and further boosts the performance. Extensive experimental results\nindicate that our method surpasses prior SNN distillation strategies, whether\nbased on logit distillation, feature distillation, or a combination of both.\nThe code will be available on GitHub.\n","authors":["Kairong Yu","Chengting Yu","Tianqing Zhang","Xiaochen Zhao","Shu Yang","Hongwei Wang","Qiang Zhang","Qi Xu"],"pdf_url":"https://arxiv.org/pdf/2503.03144v1.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03141v1","updated":"2025-03-05T03:31:05Z","published":"2025-03-05T03:31:05Z","title":"Implicit U-KAN2.0: Dynamic, Efficient and Interpretable Medical Image\n Segmentation","summary":" Image segmentation is a fundamental task in both image analysis and medical\napplications. State-of-the-art methods predominantly rely on encoder-decoder\narchitectures with a U-shaped design, commonly referred to as U-Net. Recent\nadvancements integrating transformers and MLPs improve performance but still\nface key limitations, such as poor interpretability, difficulty handling\nintrinsic noise, and constrained expressiveness due to discrete layer\nstructures, often lacking a solid theoretical foundation.In this work, we\nintroduce Implicit U-KAN 2.0, a novel U-Net variant that adopts a two-phase\nencoder-decoder structure. In the SONO phase, we use a second-order neural\nordinary differential equation (NODEs), called the SONO block, for a more\nefficient, expressive, and theoretically grounded modeling approach. In the\nSONO-MultiKAN phase, we integrate the second-order NODEs and MultiKAN layer as\nthe core computational block to enhance interpretability and representation\npower. Our contributions are threefold. 
First, U-KAN 2.0 is an implicit deep\nneural network incorporating MultiKAN and second order NODEs, improving\ninterpretability and performance while reducing computational costs. Second, we\nprovide a theoretical analysis demonstrating that the approximation ability of\nthe MultiKAN block is independent of the input dimension. Third, we conduct\nextensive experiments on a variety of 2D and a single 3D dataset, demonstrating\nthat our model consistently outperforms existing segmentation networks.\n","authors":["Chun-Wun Cheng","Yining Zhao","Yanqi Cheng","Javier Montoya","Carola-Bibiane Schönlieb","Angelica I Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2503.03141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05272v2","updated":"2025-03-05T03:26:07Z","published":"2025-01-09T14:31:54Z","title":"Solving the Catastrophic Forgetting Problem in Generalized Category\n Discovery","summary":" Generalized Category Discovery (GCD) aims to identify a mix of known and\nnovel categories within unlabeled data sets, providing a more realistic setting\nfor image recognition. Essentially, GCD needs to remember existing patterns\nthoroughly to recognize novel categories. Recent state-of-the-art method SimGCD\ntransfers the knowledge from known-class data to the learning of novel classes\nthrough debiased learning. However, some patterns are catastrophically forgot\nduring adaptation and thus lead to poor performance in novel categories\nclassification. To address this issue, we propose a novel learning approach,\nLegoGCD, which is seamlessly integrated into previous methods to enhance the\ndiscrimination of novel classes while maintaining performance on previously\nencountered known classes. Specifically, we design two types of techniques\ntermed as Local Entropy Regularization (LER) and Dual-views Kullback Leibler\ndivergence constraint (DKL). 
The LER optimizes the distribution of potential\nknown class samples in unlabeled data, thus ensuring the preservation of\nknowledge related to known categories while learning novel classes. Meanwhile,\nDKL introduces Kullback Leibler divergence to encourage the model to produce a\nsimilar prediction distribution of two view samples from the same image. In\nthis way, it successfully avoids mismatched prediction and generates more\nreliable potential known class samples simultaneously. Extensive experiments\nvalidate that the proposed LegoGCD effectively addresses the known category\nforgetting issue across all datasets, eg, delivering a 7.74% and 2.51% accuracy\nboost on known and novel classes in CUB, respectively. Our code is available\nat: https://github.com/Cliffia123/LegoGCD.\n","authors":["Xinzi Cao","Xiawu Zheng","Guanhong Wang","Weijiang Yu","Yunhang Shen","Ke Li","Yutong Lu","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2501.05272v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2503.01202v3","updated":"2025-03-05T03:11:07Z","published":"2025-03-03T05:55:30Z","title":"A Multi-Sensor Fusion Approach for Rapid Orthoimage Generation in\n Large-Scale UAV Mapping","summary":" Rapid generation of large-scale orthoimages from Unmanned Aerial Vehicles\n(UAVs) has been a long-standing focus of research in the field of aerial\nmapping. A multi-sensor UAV system, integrating the Global Positioning System\n(GPS), Inertial Measurement Unit (IMU), 4D millimeter-wave radar and camera,\ncan provide an effective solution to this problem. In this paper, we utilize\nmulti-sensor data to overcome the limitations of conventional orthoimage\ngeneration methods in terms of temporal performance, system robustness, and\ngeographic reference accuracy. 
A prior-pose-optimized feature matching method\nis introduced to enhance matching speed and accuracy, reducing the number of\nrequired features and providing precise references for the Structure from\nMotion (SfM) process. The proposed method exhibits robustness in low-texture\nscenes like farmlands, where feature matching is difficult. Experiments show\nthat our approach achieves accurate feature matching orthoimage generation in a\nshort time. The proposed drone system effectively aids in farmland detection\nand management.\n","authors":["Jialei He","Zhihao Zhan","Zhituo Tu","Xiang Zhu","Jie Yuan"],"pdf_url":"https://arxiv.org/pdf/2503.01202v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17773v3","updated":"2025-03-05T03:07:12Z","published":"2024-07-25T05:02:39Z","title":"KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models","summary":" This paper investigates visual analogical reasoning in large multimodal\nmodels (LMMs) compared to human adults and children. A \"visual analogy\" is an\nabstract rule inferred from one image and applied to another. While benchmarks\nexist for testing visual reasoning in LMMs, they require advanced skills and\nomit basic visual analogies that even young children can make. Inspired by\ndevelopmental psychology, we propose a new benchmark of 4,300 visual\ntransformations of everyday objects to test LMMs on visual analogical reasoning\nand compare them to children (ages three to five) and to adults. We structure\nthe evaluation into three stages: identifying what changed (e.g., color,\nnumber, etc.), how it changed (e.g., added one object), and applying the rule\nto new scenarios. Our findings show that while GPT-o1, GPT-4V, LLaVA-1.5, and\nMANTIS identify the \"what\" effectively, they struggle with quantifying the\n\"how\" and extrapolating this rule to new objects. 
In contrast, children and\nadults exhibit much stronger analogical reasoning at all three stages.\nAdditionally, the strongest tested model, GPT-o1, performs better in tasks\ninvolving simple surface-level visual attributes like color and size,\ncorrelating with quicker human adult response times. Conversely, more complex\ntasks such as number, rotation, and reflection, which necessitate extensive\ncognitive processing and understanding of extrinsic spatial properties in the\nphysical world, present more significant challenges. Altogether, these findings\nhighlight the limitations of training models on data that primarily consists of\n2D images and text.\n","authors":["Eunice Yiu","Maan Qraitem","Anisa Noor Majhi","Charlie Wong","Yutong Bai","Shiry Ginosar","Alison Gopnik","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2407.17773v3.pdf","comment":"10 pages. Project website: https://ey242.github.io/kiva.github.io/.\n Benchmark and code: https://github.com/ey242/KiVA"},{"id":"http://arxiv.org/abs/2503.03132v1","updated":"2025-03-05T03:02:59Z","published":"2025-03-05T03:02:59Z","title":"Dynamic Neural Surfaces for Elastic 4D Shape Representation and Analysis","summary":" We propose a novel framework for the statistical analysis of genus-zero 4D\nsurfaces, i.e., 3D surfaces that deform and evolve over time. This problem is\nparticularly challenging due to the arbitrary parameterizations of these\nsurfaces and their varying deformation speeds, necessitating effective\nspatiotemporal registration. Traditionally, 4D surfaces are discretized, in\nspace and time, before computing their spatiotemporal registrations, geodesics,\nand statistics. However, this approach may result in suboptimal solutions and,\nas we demonstrate in this paper, is not necessary. In contrast, we treat 4D\nsurfaces as continuous functions in both space and time. 
We introduce Dynamic\nSpherical Neural Surfaces (D-SNS), an efficient smooth and continuous\nspatiotemporal representation for genus-0 4D surfaces. We then demonstrate how\nto perform core 4D shape analysis tasks such as spatiotemporal registration,\ngeodesics computation, and mean 4D shape estimation, directly on these\ncontinuous representations without upfront discretization and meshing. By\nintegrating neural representations with classical Riemannian geometry and\nstatistical shape analysis techniques, we provide the building blocks for\nenabling full functional shape analysis. We demonstrate the efficiency of the\nframework on 4D human and face datasets. The source code and additional results\nare available at https://4d-dsns.github.io/DSNS/.\n","authors":["Awais Nizamani","Hamid Laga","Guanjin Wang","Farid Boussaid","Mohammed Bennamoun","Anuj Srivastava"],"pdf_url":"https://arxiv.org/pdf/2503.03132v1.pdf","comment":"22 pages, 23 figures, conference paper"},{"id":"http://arxiv.org/abs/2412.04814v3","updated":"2025-03-05T02:43:42Z","published":"2024-12-06T07:16:14Z","title":"LiFT: Leveraging Human Feedback for Text-to-Video Model Alignment","summary":" Recent advances in text-to-video (T2V) generative models have shown\nimpressive capabilities. However, these models are still inadequate in aligning\nsynthesized videos with human preferences (e.g., accurately reflecting text\ndescriptions), which is particularly difficult to address, as human preferences\nare subjective and challenging to formalize as objective functions. Existing\nstudies train video quality assessment models that rely on human-annotated\nratings for video evaluation but overlook the reasoning behind evaluations,\nlimiting their ability to capture nuanced human criteria. Moreover, aligning\nT2V model using video-based human feedback remains unexplored. Therefore, this\npaper proposes LiFT, the first method designed to leverage human feedback for\nT2V model alignment. 
Specifically, we first construct a Human Rating Annotation\ndataset, LiFT-HRA, consisting of approximately 10k human annotations, each\nincluding a score and its corresponding rationale. Based on this, we train a\nreward model LiFT-Critic to learn reward function effectively, which serves as\na proxy for human judgment, measuring the alignment between given videos and\nhuman expectations. Lastly, we leverage the learned reward function to align\nthe T2V model by maximizing the reward-weighted likelihood. As a case study, we\napply our pipeline to CogVideoX-2B, showing that the fine-tuned model\noutperforms the CogVideoX-5B across all 16 metrics, highlighting the potential\nof human feedback in improving the alignment and quality of synthesized videos.\n","authors":["Yibin Wang","Zhiyu Tan","Junyan Wang","Xiaomeng Yang","Cheng Jin","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2412.04814v3.pdf","comment":"Project page: https://codegoat24.github.io/LiFT"},{"id":"http://arxiv.org/abs/2502.17039v2","updated":"2025-03-05T02:33:16Z","published":"2025-02-24T10:46:28Z","title":"LCV2I: Communication-Efficient and High-Performance Collaborative\n Perception Framework with Low-Resolution LiDAR","summary":" Vehicle-to-Infrastructure (V2I) collaborative perception leverages data\ncollected by infrastructure's sensors to enhance vehicle perceptual\ncapabilities. LiDAR, as a commonly used sensor in cooperative perception, is\nwidely equipped in intelligent vehicles and infrastructure. However, its\nsuperior performance comes with a correspondingly high cost. To achieve\nlow-cost V2I, reducing the cost of LiDAR is crucial. Therefore, we study\nadopting low-resolution LiDAR on the vehicle to minimize cost as much as\npossible. However, simply reducing the resolution of vehicle's LiDAR results in\nsparse point clouds, making distant small objects even more blurred.\nAdditionally, traditional communication methods have relatively low bandwidth\nutilization efficiency. 
These factors pose challenges for us. To balance cost\nand perceptual accuracy, we propose a new collaborative perception framework,\nnamely LCV2I. LCV2I uses data collected from cameras and low-resolution LiDAR\nas input. It also employs feature offset correction modules and regional\nfeature enhancement algorithms to improve feature representation. Finally, we\nuse regional difference map and regional score map to assess the value of\ncollaboration content, thereby improving communication bandwidth efficiency. In\nsummary, our approach achieves high perceptual performance while substantially\nreducing the demand for high-resolution sensors on the vehicle. To evaluate\nthis algorithm, we conduct 3D object detection in the real-world scenario of\nDAIR-V2X, demonstrating that the performance of LCV2I consistently surpasses\ncurrently existing algorithms.\n","authors":["Xinxin Feng","Haoran Sun","Haifeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2502.17039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16226v4","updated":"2025-03-05T02:30:54Z","published":"2024-05-25T13:34:16Z","title":"Detecting Adversarial Data using Perturbation Forgery","summary":" As a defense strategy against adversarial attacks, adversarial detection aims\nto identify and filter out adversarial data from the data flow based on\ndiscrepancies in distribution and noise patterns between natural and\nadversarial data. Although previous detection methods achieve high performance\nin detecting gradient-based adversarial attacks, new attacks based on\ngenerative models with imbalanced and anisotropic noise patterns evade\ndetection. Even worse, the significant inference time overhead and limited\nperformance against unseen attacks make existing techniques impractical for\nreal-world use. In this paper, we explore the proximity relationship among\nadversarial noise distributions and demonstrate the existence of an open\ncovering for these distributions. 
By training on the open covering of\nadversarial noise distributions, a detector with strong generalization\nperformance against various types of unseen attacks can be developed. Based on\nthis insight, we heuristically propose Perturbation Forgery, which includes\nnoise distribution perturbation, sparse mask generation, and pseudo-adversarial\ndata production, to train an adversarial detector capable of detecting any\nunseen gradient-based, generative-based, and physical adversarial attacks.\nComprehensive experiments conducted on multiple general and facial datasets,\nwith a wide spectrum of attacks, validate the strong generalization of our\nmethod.\n","authors":["Qian Wang","Chen Li","Yuchen Luo","Hefei Ling","Shijuan Huang","Ruoxi Jia","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2405.16226v4.pdf","comment":"Accepted as a conference paper at CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03115v1","updated":"2025-03-05T02:24:13Z","published":"2025-03-05T02:24:13Z","title":"NTR-Gaussian: Nighttime Dynamic Thermal Reconstruction with 4D Gaussian\n Splatting Based on Thermodynamics","summary":" Thermal infrared imaging offers the advantage of all-weather capability,\nenabling non-intrusive measurement of an object's surface temperature.\nConsequently, thermal infrared images are employed to reconstruct 3D models\nthat accurately reflect the temperature distribution of a scene, aiding in\napplications such as building monitoring and energy management. However,\nexisting approaches predominantly focus on static 3D reconstruction for a\nsingle time period, overlooking the impact of environmental factors on thermal\nradiation and failing to predict or analyze temperature variations over time.\nTo address these challenges, we propose the NTR-Gaussian method, which treats\ntemperature as a form of thermal radiation, incorporating elements like\nconvective heat transfer and radiative heat dissipation. 
Our approach utilizes\nneural networks to predict thermodynamic parameters such as emissivity,\nconvective heat transfer coefficient, and heat capacity. By integrating these\npredictions, we can accurately forecast thermal temperatures at various times\nthroughout a nighttime scene. Furthermore, we introduce a dynamic dataset\nspecifically for nighttime thermal imagery. Extensive experiments and\nevaluations demonstrate that NTR-Gaussian significantly outperforms comparison\nmethods in thermal reconstruction, achieving a predicted temperature error\nwithin 1 degree Celsius.\n","authors":["Kun Yang","Yuxiang Liu","Zeyu Cui","Yu Liu","Maojun Zhang","Shen Yan","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03115v1.pdf","comment":"IEEE Conference on Computer Vision and Pattern Recognition 2025"},{"id":"http://arxiv.org/abs/2503.02593v2","updated":"2025-03-05T02:11:25Z","published":"2025-03-04T13:17:17Z","title":"CMMLoc: Advancing Text-to-PointCloud Localization with\n Cauchy-Mixture-Model Based Framework","summary":" The goal of point cloud localization based on linguistic description is to\nidentify a 3D position using textual description in large urban environments,\nwhich has potential applications in various fields, such as determining the\nlocation for vehicle pickup or goods delivery. Ideally, for a textual\ndescription and its corresponding 3D location, the objects around the 3D\nlocation should be fully described in the text description. However, in\npractical scenarios, e.g., vehicle pickup, passengers usually describe only the\npart of the most significant and nearby surroundings instead of the entire\nenvironment. In response to this $\\textbf{partially relevant}$ challenge, we\npropose $\\textbf{CMMLoc}$, an uncertainty-aware\n$\\textbf{C}$auchy-$\\textbf{M}$ixture-$\\textbf{M}$odel ($\\textbf{CMM}$) based\nframework for text-to-point-cloud $\\textbf{Loc}$alization. 
To model the\nuncertain semantic relations between text and point cloud, we integrate CMM\nconstraints as a prior during the interaction between the two modalities. We\nfurther design a spatial consolidation scheme to enable adaptive aggregation of\ndifferent 3D objects with varying receptive fields. To achieve precise\nlocalization, we propose a cardinal direction integration module alongside a\nmodality pre-alignment strategy, helping capture the spatial relationships\namong objects and bringing the 3D objects closer to the text modality.\nComprehensive experiments validate that CMMLoc outperforms existing methods,\nachieving state-of-the-art results on the KITTI360Pose dataset. Codes are\navailable in this GitHub repository https://github.com/kevin301342/CMMLoc.\n","authors":["Yanlong Xu","Haoxuan Qu","Jun Liu","Wenxiao Zhang","Xun Yang"],"pdf_url":"https://arxiv.org/pdf/2503.02593v2.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03111v1","updated":"2025-03-05T02:10:14Z","published":"2025-03-05T02:10:14Z","title":"An Improved Pure Fully Connected Neural Network for Rice Grain\n Classification","summary":" Rice is a staple food for a significant portion of the world's population,\nproviding essential nutrients and serving as a versatile in-gredient in a wide\nrange of culinary traditions. Recently, the use of deep learning has enabled\nautomated classification of rice, im-proving accuracy and efficiency. However,\nclassical models based on first-stage training may face difficulties in\ndistinguishing between rice varieties with similar external characteristics,\nthus leading to misclassifications. Considering the transparency and\nfeasibility of model, we selected and gradually improved pure fully connected\nneural network to achieve classification of rice grain. The dataset we used\ncontains both global and domestic rice images obtained from websites and\nlaboratories respectively. 
First, the training mode was changed from one-stage\ntraining to two-stage training, which significantly contributes to\ndistinguishing two similar types of rice. Secondly, the preprocessing method\nwas changed from random tilting to horizontal or vertical position cor-rection.\nAfter those two enhancements, the accuracy of our model increased notably from\n97% to 99%. In summary, two subtle methods proposed in this study can\nremarkably enhance the classification ability of deep learning models in terms\nof the classification of rice grain.\n","authors":["Wanke Xia","Ruoxin Peng","Haoqi Chu","Xinlei Zhu"],"pdf_url":"https://arxiv.org/pdf/2503.03111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03110v1","updated":"2025-03-05T02:10:04Z","published":"2025-03-05T02:10:04Z","title":"WarmFed: Federated Learning with Warm-Start for Globalization and\n Personalization Via Personalized Diffusion Models","summary":" Federated Learning (FL) stands as a prominent distributed learning paradigm\namong multiple clients to achieve a unified global model without privacy\nleakage. In contrast to FL, Personalized federated learning aims at serving for\neach client in achieving persoanlized model. However, previous FL frameworks\nhave grappled with a dilemma: the choice between developing a singular global\nmodel at the server to bolster globalization or nurturing personalized model at\nthe client to accommodate personalization. Instead of making trade-offs, this\npaper commences its discourse from the pre-trained initialization, obtaining\nresilient global information and facilitating the development of both global\nand personalized models. Specifically, we propose a novel method called WarmFed\nto achieve this. WarmFed customizes Warm-start through personalized diffusion\nmodels, which are generated by local efficient fine-tunining (LoRA). 
Building\nupon the Warm-Start, we advance a server-side fine-tuning strategy to derive\nthe global model, and propose a dynamic self-distillation (DSD) to procure more\nresilient personalized models simultaneously. Comprehensive experiments\nunderscore the substantial gains of our approach across both global and\npersonalized models, achieved within just one-shot and five communication(s).\n","authors":["Tao Feng","Jie Zhang","Xiangjian Li","Rong Huang","Huashan Liu","Zhijie Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15050v4","updated":"2025-03-05T02:09:23Z","published":"2024-12-19T16:57:45Z","title":"Uni-Renderer: Unifying Rendering and Inverse Rendering Via Dual Stream\n Diffusion","summary":" Rendering and inverse rendering are pivotal tasks in both computer vision and\ngraphics. The rendering equation is the core of the two tasks, as an ideal\nconditional distribution transfer function from intrinsic properties to RGB\nimages. Despite achieving promising results of existing rendering methods, they\nmerely approximate the ideal estimation for a specific scene and come with a\nhigh computational cost. Additionally, the inverse conditional distribution\ntransfer is intractable due to the inherent ambiguity. To address these\nchallenges, we propose a data-driven method that jointly models rendering and\ninverse rendering as two conditional generation tasks within a single diffusion\nframework. Inspired by UniDiffuser, we utilize two distinct time schedules to\nmodel both tasks, and with a tailored dual streaming module, we achieve\ncross-conditioning of two pre-trained diffusion models. This unified approach,\nnamed Uni-Renderer, allows the two processes to facilitate each other through a\ncycle-consistent constrain, mitigating ambiguity by enforcing consistency\nbetween intrinsic properties and rendered images. 
Combined with a meticulously\nprepared dataset, our method effectively decomposition of intrinsic properties\nand demonstrates a strong capability to recognize changes during rendering. We\nwill open-source our training and inference code to the public, fostering\nfurther research and development in this area.\n","authors":["Zhifei Chen","Tianshuo Xu","Wenhang Ge","Leyi Wu","Dongyu Yan","Jing He","Luozhou Wang","Lu Zeng","Shunsi Zhang","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2412.15050v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01582v2","updated":"2025-03-05T02:02:19Z","published":"2025-03-03T14:23:37Z","title":"Category-level Meta-learned NeRF Priors for Efficient Object Mapping","summary":" In 3D object mapping, category-level priors enable efficient object\nreconstruction and canonical pose estimation, requiring only a single prior per\nsemantic category (e.g., chair, book, laptop). Recently, DeepSDF has\npredominantly been used as a category-level shape prior, but it struggles to\nreconstruct sharp geometry and is computationally expensive. In contrast, NeRFs\ncapture fine details but have yet to be effectively integrated with\ncategory-level priors in a real-time multi-object mapping framework. To bridge\nthis gap, we introduce PRENOM, a Prior-based Efficient Neural Object Mapper\nthat integrates category-level priors with object-level NeRFs to enhance\nreconstruction efficiency while enabling canonical object pose estimation.\nPRENOM gets to know objects on a first-name basis by meta-learning on synthetic\nreconstruction tasks generated from open-source shape datasets. To account for\nobject category variations, it employs a multi-objective genetic algorithm to\noptimize the NeRF architecture for each category, balancing reconstruction\nquality and training time. 
Additionally, prior-based probabilistic ray sampling\ndirects sampling toward expected object regions, accelerating convergence and\nimproving reconstruction quality under constrained resources. Experimental\nresults on a low-end GPU highlight the ability of PRENOM to achieve\nhigh-quality reconstructions while maintaining computational feasibility.\nSpecifically, comparisons with prior-free NeRF-based approaches on a synthetic\ndataset show a 21% lower Chamfer distance, demonstrating better reconstruction\nquality. Furthermore, evaluations against other approaches using shape priors\non a noisy real-world dataset indicate a 13% improvement averaged across all\nreconstruction metrics, and comparable pose and size estimation accuracy, while\nbeing trained for 5x less time.\n","authors":["Saad Ejaz","Hriday Bavle","Laura Ribeiro","Holger Voos","Jose Luis Sanchez-Lopez"],"pdf_url":"https://arxiv.org/pdf/2503.01582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17859v3","updated":"2025-03-05T01:48:25Z","published":"2024-05-28T06:16:57Z","title":"Adapting Pre-Trained Vision Models for Novel Instance Detection and\n Segmentation","summary":" Novel Instance Detection and Segmentation (NIDS) aims at detecting and\nsegmenting novel object instances given a few examples of each instance. We\npropose a unified, simple, yet effective framework (NIDS-Net) comprising object\nproposal generation, embedding creation for both instance templates and\nproposal regions, and embedding matching for instance label assignment.\nLeveraging recent advancements in large vision methods, we utilize Grounding\nDINO and Segment Anything Model (SAM) to obtain object proposals with accurate\nbounding boxes and masks. Central to our approach is the generation of\nhigh-quality instance embeddings. 
We utilized foreground feature averages of\npatch embeddings from the DINOv2 ViT backbone, followed by refinement through a\nweight adapter mechanism that we introduce.\n We show experimentally that our weight adapter can adjust the embeddings\nlocally within their feature space and effectively limit overfitting in the\nfew-shot setting. Furthermore, the weight adapter optimizes weights to enhance\nthe distinctiveness of instance embeddings during similarity computation. This\nmethodology enables a straightforward matching strategy that results in\nsignificant performance gains. Our framework surpasses current state-of-the-art\nmethods, demonstrating notable improvements in four detection datasets. In the\nsegmentation tasks on seven core datasets of the BOP challenge, our method\noutperforms the leading published RGB methods and remains competitive with the\nbest RGB-D method. We have also verified our method using real-world images\nfrom a Fetch robot and a RealSense camera. Project Page:\nhttps://irvlutd.github.io/NIDSNet/\n","authors":["Yangxiao Lu","Jishnu Jaykumar P","Yunhui Guo","Nicholas Ruozzi","Yu Xiang"],"pdf_url":"https://arxiv.org/pdf/2405.17859v3.pdf","comment":"Project Page: https://irvlutd.github.io/NIDSNet/"},{"id":"http://arxiv.org/abs/2503.03104v1","updated":"2025-03-05T01:41:59Z","published":"2025-03-05T01:41:59Z","title":"RVAFM: Re-parameterizing Vertical Attention Fusion Module for\n Handwritten Paragraph Text Recognition","summary":" Handwritten Paragraph Text Recognition (HPTR) is a challenging task in\nComputer Vision, requiring the transformation of a paragraph text image, rich\nin handwritten text, into text encoding sequences. 
One of the most advanced\nmodels for this task is Vertical Attention Network (VAN), which utilizes a\nVertical Attention Module (VAM) to implicitly segment paragraph text images\ninto text lines, thereby reducing the difficulty of the recognition task.\nHowever, from a network structure perspective, VAM is a single-branch module,\nwhich is less effective in learning compared to multi-branch modules. In this\npaper, we propose a new module, named Re-parameterizing Vertical Attention\nFusion Module (RVAFM), which incorporates structural re-parameterization\ntechniques. RVAFM decouples the structure of the module during training and\ninference stages. During training, it uses a multi-branch structure for more\neffective learning, and during inference, it uses a single-branch structure for\nfaster processing. The features learned by the multi-branch structure are fused\ninto the single-branch structure through a special fusion method named\nRe-parameterization Fusion (RF) without any loss of information. As a result,\nwe achieve a Character Error Rate (CER) of 4.44% and a Word Error Rate (WER) of\n14.37% on the IAM paragraph-level test set. Additionally, the inference speed\nis slightly faster than VAN.\n","authors":["Jinhui Zheng","Zhiquan Liu","Yain-Whar Si","Jianqing Li","Xinyuan Zhang","Xiaofan Li","Haozhi Huang","Xueyuan Gong"],"pdf_url":"https://arxiv.org/pdf/2503.03104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13524v3","updated":"2025-03-05T01:21:38Z","published":"2025-02-19T08:21:59Z","title":"MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D\n Medical Image Analysis","summary":" Efficient evaluation of three-dimensional (3D) medical images is crucial for\ndiagnostic and therapeutic practices in healthcare. Recent years have seen a\nsubstantial uptake in applying deep learning and computer vision to analyse and\ninterpret medical images. 
Traditional approaches, such as convolutional neural\nnetworks (CNNs) and vision transformers (ViTs), face significant computational\nchallenges, prompting the need for architectural advancements. Recent efforts\nhave led to the introduction of novel architectures like the ``Mamba'' model as\nalternative solutions to traditional CNNs or ViTs. The Mamba model excels in\nthe linear processing of one-dimensional data with low computational demands.\nHowever, Mamba's potential for 3D medical image analysis remains underexplored\nand could face significant computational challenges as the dimension increases.\nThis manuscript presents MobileViM, a streamlined architecture for efficient\nsegmentation of 3D medical images. In the MobileViM network, we invent a new\ndimension-independent mechanism and a dual-direction traversing approach to\nincorporate with a vision-Mamba-based framework. MobileViM also features a\ncross-scale bridging technique to improve efficiency and accuracy across\nvarious medical imaging modalities. With these enhancements, MobileViM achieves\nsegmentation speeds exceeding 90 frames per second (FPS) on a single graphics\nprocessing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster\nthan the state-of-the-art deep learning models for processing 3D images with\nthe same computational resources. 
In addition, experimental evaluations\ndemonstrate that MobileViM delivers superior performance, with Dice similarity\nscores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024,\nATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses\nexisting models.\n","authors":["Wei Dai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2502.13524v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.09795v2","updated":"2025-03-05T01:09:24Z","published":"2025-02-13T22:10:21Z","title":"Vision-based Geo-Localization of Future Mars Rotorcraft in Challenging\n Illumination Conditions","summary":" Planetary exploration using aerial assets has the potential for unprecedented\nscientific discoveries on Mars. While NASA's Mars helicopter Ingenuity proved\nflight in Martian atmosphere is possible, future Mars rotocrafts will require\nadvanced navigation capabilities for long-range flights. One such critical\ncapability is Map-based Localization (MbL) which registers an onboard image to\na reference map during flight in order to mitigate cumulative drift from visual\nodometry. However, significant illumination differences between rotocraft\nobservations and a reference map prove challenging for traditional MbL systems,\nrestricting the operational window of the vehicle. In this work, we investigate\na new MbL system and propose Geo-LoFTR, a geometry-aided deep learning model\nfor image registration that is more robust under large illumination differences\nthan prior models. The system is supported by a custom simulation framework\nthat uses real orbital maps to produce large amounts of realistic images of the\nMartian terrain. Comprehensive evaluations show that our proposed system\noutperforms prior MbL efforts in terms of localization accuracy under\nsignificant lighting and scale variations. 
Furthermore, we demonstrate the\nvalidity of our approach across a simulated Martian day.\n","authors":["Dario Pisanti","Robert Hewitt","Roland Brockers","Georgios Georgakis"],"pdf_url":"https://arxiv.org/pdf/2502.09795v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03088v1","updated":"2025-03-05T01:04:45Z","published":"2025-03-05T01:04:45Z","title":"AHCPTQ: Accurate and Hardware-Compatible Post-Training Quantization for\n Segment Anything Model","summary":" The Segment Anything Model (SAM) has demonstrated strong versatility across\nvarious visual tasks. However, its large storage requirements and high\ncomputational cost pose challenges for practical deployment. Post-training\nquantization (PTQ) has emerged as an effective strategy for efficient\ndeployment, but we identify two key challenges in SAM that hinder the\neffectiveness of existing PTQ methods: the heavy-tailed and skewed distribution\nof post-GELU activations, and significant inter-channel variation in linear\nprojection activations. To address these challenges, we propose AHCPTQ, an\naccurate and hardware-efficient PTQ method for SAM. AHCPTQ introduces\nhardware-compatible Hybrid Log-Uniform Quantization (HLUQ) to manage post-GELU\nactivations, employing log2 quantization for dense small values and uniform\nquantization for sparse large values to enhance quantization resolution.\nAdditionally, AHCPTQ incorporates Channel-Aware Grouping (CAG) to mitigate\ninter-channel variation by progressively clustering activation channels with\nsimilar distributions, enabling them to share quantization parameters and\nimproving hardware efficiency. The combination of HLUQ and CAG not only\nenhances quantization effectiveness but also ensures compatibility with\nefficient hardware execution. 
For instance, under the W4A4 configuration on the\nSAM-L model, AHCPTQ achieves 36.6% mAP on instance segmentation with the DINO\ndetector, while achieving a 7.89x speedup and 8.64x energy efficiency over its\nfloating-point counterpart in FPGA implementation.\n","authors":["Wenlun Zhang","Shimpei Ando","Kentaro Yoshioka"],"pdf_url":"https://arxiv.org/pdf/2503.03088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01962v2","updated":"2025-03-05T00:41:20Z","published":"2024-10-02T19:10:23Z","title":"LS-HAR: Language Supervised Human Action Recognition with Salient\n Fusion, Construction Sites as a Use-Case","summary":" Detecting human actions is a crucial task for autonomous robots and vehicles,\noften requiring the integration of various data modalities for improved\naccuracy. In this study, we introduce a novel approach to Human Action\nRecognition (HAR) using language supervision named LS-HAR based on skeleton and\nvisual cues. Our method leverages a language model to guide the feature\nextraction process in the skeleton encoder. Specifically, we employ learnable\nprompts for the language model conditioned on the skeleton modality to optimize\nfeature representation. Furthermore, we propose a fusion mechanism that\ncombines dual-modality features using a salient fusion module, incorporating\nattention and transformer mechanisms to address the modalities' high\ndimensionality. This fusion process prioritizes informative video frames and\nbody joints, enhancing the recognition accuracy of human actions. Additionally,\nwe introduce a new dataset tailored for real-world robotic applications in\nconstruction sites, featuring visual, skeleton, and depth data modalities,\nnamed VolvoConstAct. This dataset serves to facilitate the training and\nevaluation of machine learning models to instruct autonomous construction\nmachines for performing necessary tasks in real-world construction sites. 
To\nevaluate our approach, we conduct experiments on our dataset as well as three\nwidely used public datasets: NTU-RGB+D, NTU-RGB+D 120, and NW-UCLA. Results\nreveal that our proposed method achieves promising performance across all\ndatasets, demonstrating its robustness and potential for various applications.\nThe code, dataset, and demonstration of real-machine experiments are available\nat: https://mmahdavian.github.io/ls_har/\n","authors":["Mohammad Mahdavian","Mohammad Loni","Ted Samuelsson","Mo Chen"],"pdf_url":"https://arxiv.org/pdf/2410.01962v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17457v3","updated":"2025-03-05T00:32:49Z","published":"2024-07-24T17:50:00Z","title":"CSCPR: Cross-Source-Context Indoor RGB-D Place Recognition","summary":" We extend our previous work, PoCo, and present a new algorithm,\nCross-Source-Context Place Recognition (CSCPR), for RGB-D indoor place\nrecognition that integrates global retrieval and reranking into an end-to-end\nmodel and keeps the consistency of using Context-of-Clusters (CoCs) for feature\nprocessing. Unlike prior approaches that primarily focus on the RGB domain for\nplace recognition reranking, CSCPR is designed to handle the RGB-D data. We\napply the CoCs to handle cross-sourced and cross-scaled RGB-D point clouds and\nintroduce two novel modules for reranking: the Self-Context Cluster (SCC) and\nthe Cross Source Context Cluster (CSCC), which enhance feature representation\nand match query-database pairs based on local features, respectively. We also\nrelease two new datasets, ScanNetIPR and ARKitIPR. Our experiments demonstrate\nthat CSCPR significantly outperforms state-of-the-art models on these datasets\nby at least 29.27% in Recall@1 on the ScanNet-PR dataset and 43.24% in the new\ndatasets. 
Code and datasets will be released.\n","authors":["Jing Liang","Zhuo Deng","Zheming Zhou","Min Sun","Omid Ghasemalizadeh","Cheng-Hao Kuo","Arnie Sen","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2407.17457v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03074v1","updated":"2025-03-05T00:27:32Z","published":"2025-03-05T00:27:32Z","title":"BEVDriver: Leveraging BEV Maps in LLMs for Robust Closed-Loop Driving","summary":" Autonomous driving has the potential to set the stage for more efficient\nfuture mobility, requiring the research domain to establish trust through safe,\nreliable and transparent driving. Large Language Models (LLMs) possess\nreasoning capabilities and natural language understanding, presenting the\npotential to serve as generalized decision-makers for ego-motion planning that\ncan interact with humans and navigate environments designed for human drivers.\nWhile this research avenue is promising, current autonomous driving approaches\nare challenged by combining 3D spatial grounding and the reasoning and language\ncapabilities of LLMs. We introduce BEVDriver, an LLM-based model for end-to-end\nclosed-loop driving in CARLA that utilizes latent BEV features as perception\ninput. BEVDriver includes a BEV encoder to efficiently process multi-view\nimages and 3D LiDAR point clouds. Within a common latent space, the BEV\nfeatures are propagated through a Q-Former to align with natural language\ninstructions and passed to the LLM that predicts and plans precise future\ntrajectories while considering navigation instructions and critical scenarios.\nOn the LangAuto benchmark, our model reaches up to 18.9% higher performance on\nthe Driving Score compared to SoTA methods.\n","authors":["Katharina Winter","Mark Azer","Fabian B. 
Flohr"],"pdf_url":"https://arxiv.org/pdf/2503.03074v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2503.03068v1","updated":"2025-03-05T00:16:09Z","published":"2025-03-05T00:16:09Z","title":"Multi-View Depth Consistent Image Generation Using Generative AI Models:\n Application on Architectural Design of University Buildings","summary":" In the early stages of architectural design, shoebox models are typically\nused as a simplified representation of building structures but require\nextensive operations to transform them into detailed designs. Generative\nartificial intelligence (AI) provides a promising solution to automate this\ntransformation, but ensuring multi-view consistency remains a significant\nchallenge. To solve this issue, we propose a novel three-stage consistent image\ngeneration framework using generative AI models to generate architectural\ndesigns from shoebox model representations. The proposed method enhances\nstate-of-the-art image generation diffusion models to generate multi-view\nconsistent architectural images. We employ ControlNet as the backbone and\noptimize it to accommodate multi-view inputs of architectural shoebox models\ncaptured from predefined perspectives. To ensure stylistic and structural\nconsistency across multi-view images, we propose an image space loss module\nthat incorporates style loss, structural loss and angle alignment loss. We then\nuse depth estimation method to extract depth maps from the generated multi-view\nimages. Finally, we use the paired data of the architectural images and depth\nmaps as inputs to improve the multi-view consistency via the depth-aware 3D\nattention module. 
Experimental results demonstrate that the proposed framework\ncan generate multi-view architectural images with consistent style and\nstructural coherence from shoebox model inputs.\n","authors":["Xusheng Du","Ruihan Gui","Zhengyang Wang","Ye Zhang","Haoran Xie"],"pdf_url":"https://arxiv.org/pdf/2503.03068v1.pdf","comment":"10 pages, 7 figures, in Proceedings of CAADRIA2025"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2503.03750v1","updated":"2025-03-05T18:59:23Z","published":"2025-03-05T18:59:23Z","title":"The MASK Benchmark: Disentangling Honesty From Accuracy in AI Systems","summary":" As large language models (LLMs) become more capable and agentic, the\nrequirement for trust in their outputs grows significantly, yet at the same\ntime concerns have been mounting that models may learn to lie in pursuit of\ntheir goals. To address these concerns, a body of work has emerged around the\nnotion of \"honesty\" in LLMs, along with interventions aimed at mitigating\ndeceptive behaviors. However, evaluations of honesty are currently highly\nlimited, with no benchmark combining large scale and applicability to all\nmodels. Moreover, many benchmarks claiming to measure honesty in fact simply\nmeasure accuracy--the correctness of a model's beliefs--in disguise. In this\nwork, we introduce a large-scale human-collected dataset for measuring honesty\ndirectly, allowing us to disentangle accuracy from honesty for the first time.\nAcross a diverse set of LLMs, we find that while larger models obtain higher\naccuracy on our benchmark, they do not become more honest. Surprisingly, while\nmost frontier LLMs obtain high scores on truthfulness benchmarks, we find a\nsubstantial propensity in frontier LLMs to lie when pressured to do so,\nresulting in low honesty scores on our benchmark. We find that simple methods,\nsuch as representation engineering interventions, can improve honesty. 
These\nresults underscore the growing need for robust evaluations and effective\ninterventions to ensure LLMs remain trustworthy.\n","authors":["Richard Ren","Arunim Agarwal","Mantas Mazeika","Cristina Menghini","Robert Vacareanu","Brad Kenstler","Mick Yang","Isabelle Barrass","Alice Gatti","Xuwang Yin","Eduardo Trevino","Matias Geralnik","Adam Khoja","Dean Lee","Summer Yue","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2503.03750v1.pdf","comment":"Website: https://www.mask-benchmark.ai"},{"id":"http://arxiv.org/abs/2503.03746v1","updated":"2025-03-05T18:58:44Z","published":"2025-03-05T18:58:44Z","title":"Process-based Self-Rewarding Language Models","summary":" Large Language Models have demonstrated outstanding performance across\nvarious downstream tasks and have been widely applied in multiple scenarios.\nHuman-annotated preference data is used for training to further improve LLMs'\nperformance, which is constrained by the upper limit of human performance.\nTherefore, Self-Rewarding method has been proposed, where LLMs generate\ntraining data by rewarding their own outputs. However, the existing\nself-rewarding paradigm is not effective in mathematical reasoning scenarios\nand may even lead to a decline in performance. In this work, we propose the\nProcess-based Self-Rewarding pipeline for language models, which introduces\nlong-thought reasoning, step-wise LLM-as-a-Judge, and step-wise preference\noptimization within the self-rewarding paradigm. 
Our new paradigm successfully\nenhances the performance of LLMs on multiple mathematical reasoning benchmarks\nthrough iterative Process-based Self-Rewarding, demonstrating the immense\npotential of self-rewarding to achieve LLM reasoning that may surpass human\ncapabilities.\n","authors":["Shimao Zhang","Xiao Liu","Xin Zhang","Junxiao Liu","Zheheng Luo","Shujian Huang","Yeyun Gong"],"pdf_url":"https://arxiv.org/pdf/2503.03746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03743v1","updated":"2025-03-05T18:56:16Z","published":"2025-03-05T18:56:16Z","title":"CHOP: Mobile Operating Assistant with Constrained High-frequency\n Optimized Subtask Planning","summary":" The advancement of visual language models (VLMs) has enhanced mobile device\noperations, allowing simulated human-like actions to address user requirements.\nCurrent VLM-based mobile operating assistants can be structured into three\nlevels: task, subtask, and action. The subtask level, linking high-level goals\nwith low-level executable actions, is crucial for task completion but faces two\nchallenges: ineffective subtasks that lower-level agent cannot execute and\ninefficient subtasks that fail to contribute to the completion of the\nhigher-level task. These challenges stem from VLM's lack of experience in\ndecomposing subtasks within GUI scenarios in multi-agent architecture. To\naddress these, we propose a new mobile assistant architecture with constrained\nhigh-frequency o}ptimized planning (CHOP). Our approach overcomes the VLM's\ndeficiency in GUI scenarios planning by using human-planned subtasks as the\nbasis vector. We evaluate our architecture in both English and Chinese contexts\nacross 20 Apps, demonstrating significant improvements in both effectiveness\nand efficiency. 
Our dataset and code is available at\nhttps://github.com/Yuqi-Zhou/CHOP\n","authors":["Yuqi Zhou","Shuai Wang","Sunhao Dai","Qinglin Jia","Zhaocheng Du","Zhenhua Dong","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2503.03743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03733v1","updated":"2025-03-05T18:44:35Z","published":"2025-03-05T18:44:35Z","title":"Rethinking Deep Clustering Paradigms: Self-Supervision Is All You Need","summary":" The recent advances in deep clustering have been made possible by significant\nprogress in self-supervised and pseudo-supervised learning. However, the\ntrade-off between self-supervision and pseudo-supervision can give rise to\nthree primary issues. The joint training causes Feature Randomness and Feature\nDrift, whereas the independent training causes Feature Randomness and Feature\nTwist. In essence, using pseudo-labels generates random and unreliable\nfeatures. The combination of pseudo-supervision and self-supervision drifts the\nreliable clustering-oriented features. Moreover, moving from self-supervision\nto pseudo-supervision can twist the curved latent manifolds. This paper\naddresses the limitations of existing deep clustering paradigms concerning\nFeature Randomness, Feature Drift, and Feature Twist. We propose a new paradigm\nwith a new strategy that replaces pseudo-supervision with a second round of\nself-supervision training. The new strategy makes the transition between\ninstance-level self-supervision and neighborhood-level self-supervision\nsmoother and less abrupt. Moreover, it prevents the drifting effect that is\ncaused by the strong competition between instance-level self-supervision and\nclustering-level pseudo-supervision. Moreover, the absence of the\npseudo-supervision prevents the risk of generating random features. With this\nnovel approach, our paper introduces a Rethinking of the Deep Clustering\nParadigms, denoted by R-DC. 
Our model is specifically designed to address three\nprimary challenges encountered in Deep Clustering: Feature Randomness, Feature\nDrift, and Feature Twist. Experimental results conducted on six datasets have\nshown that the two-level self-supervision training yields substantial\nimprovements.\n","authors":["Amal Shaheena","Nairouz Mrabahb","Riadh Ksantinia","Abdulla Alqaddoumia"],"pdf_url":"https://arxiv.org/pdf/2503.03733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07674v2","updated":"2025-03-05T18:39:05Z","published":"2025-01-13T20:13:59Z","title":"CDS: Data Synthesis Method Guided by Cognitive Diagnosis Theory","summary":" Large Language Models (LLMs) have achieved significant advancements, but the\nincreasing complexity of tasks and higher performance demands highlight the\nneed for continuous improvement. Some approaches utilize synthetic data\ngenerated by advanced LLMs based on evaluation results to train models.\nHowever, conventional evaluation methods fail to provide detailed, fine-grained\nprofiles of LLMs, limiting their guidance for data synthesis. In this paper, we\nintroduce the Cognitive Diagnostic Synthesis (CDS) method, which incorporates a\ndiagnostic process inspired by Cognitive Diagnosis Theory (CDT) to refine\nevaluation results and characterize model profiles at the knowledge component\nlevel. Based on these diagnostics, we propose two diagnosis-synthesis\nstrategies for weakness-targeted data synthesis. Additionally, we present an\nenhanced data augmentation and selection pipeline to improve the quality and\ndiversity of synthesized data. Our experiments with several open-source models\nshow significant improvements across multiple benchmarks, achieving up to 6.00%\nimprovement in code generation, 13.10% in mathematical reasoning, and 5.43% in\nacademic exams. 
Code and data are available on GitHub.\n","authors":["Haokun Zhao","Jinyi Han","Jiaqing Liang","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2501.07674v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.07132v2","updated":"2025-03-05T18:33:41Z","published":"2025-02-10T23:50:09Z","title":"Interactive Data Harmonization with LLM Agents","summary":" Data harmonization is an essential task that entails integrating datasets\nfrom diverse sources. Despite years of research in this area, it remains a\ntime-consuming and challenging task due to schema mismatches, varying\nterminologies, and differences in data collection methodologies. This paper\npresents the case for agentic data harmonization as a means to both empower\nexperts to harmonize their data and to streamline the process. We introduce\nHarmonia, a system that combines LLM-based reasoning, an interactive user\ninterface, and a library of data harmonization primitives to automate the\nsynthesis of data harmonization pipelines. We demonstrate Harmonia in a\nclinical data harmonization scenario, where it helps to interactively create\nreusable pipelines that map datasets to a standard format. Finally, we discuss\nchallenges and open problems, and suggest research directions for advancing our\nvision.\n","authors":["Aécio Santos","Eduardo H. M. Pena","Roque Lopez","Juliana Freire"],"pdf_url":"https://arxiv.org/pdf/2502.07132v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03724v1","updated":"2025-03-05T18:24:58Z","published":"2025-03-05T18:24:58Z","title":"Deep Causal Behavioral Policy Learning: Applications to Healthcare","summary":" We present a deep learning-based approach to studying dynamic clinical\nbehavioral regimes in diverse non-randomized healthcare settings. 
Our proposed\nmethodology - deep causal behavioral policy learning (DC-BPL) - uses deep\nlearning algorithms to learn the distribution of high-dimensional clinical\naction paths, and identifies the causal link between these action paths and\npatient outcomes. Specifically, our approach: (1) identifies the causal effects\nof provider assignment on clinical outcomes; (2) learns the distribution of\nclinical actions a given provider would take given evolving patient\ninformation; (3) and combines these steps to identify the optimal provider for\na given patient type and emulate that provider's care decisions. Underlying\nthis strategy, we train a large clinical behavioral model (LCBM) on electronic\nhealth records data using a transformer architecture, and demonstrate its\nability to estimate clinical behavioral policies. We propose a novel\ninterpretation of a behavioral policy learned using the LCBM: that it is an\nefficient encoding of complex, often implicit, knowledge used to treat a\npatient. This allows us to learn a space of policies that are critical to a\nwide range of healthcare applications, in which the vast majority of clinical\nknowledge is acquired tacitly through years of practice and only a tiny\nfraction of information relevant to patient care is written down (e.g. 
in\ntextbooks, studies or standardized guidelines).\n","authors":["Jonas Knecht","Anna Zink","Jonathan Kolstad","Maya Petersen"],"pdf_url":"https://arxiv.org/pdf/2503.03724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14395v2","updated":"2025-03-05T18:17:28Z","published":"2024-04-22T17:55:56Z","title":"PARAMANU-GANITA: Can Small Math Language Models Rival with Large\n Language Models on Mathematical Reasoning?","summary":" In this paper, we study whether domain specific pretraining of small\ngenerative language models (SLM) from scratch with domain specialized tokenizer\nand Chain-of-Thought (CoT) instruction fine-tuning results in competitive\nperformance on mathematical reasoning compared to LLMs? Secondly, whether this\napproach is environmentally sustainable, highly cost efficient? To address\nthese research questions, we present Paramanu-Ganita, a 208 million-parameter\nnovel decoder-only Auto Regressive SLM on mathematics. We performed pretraining\nfrom scratch on 31.5 billion tokens for 170 A100 hours using a context size of\n4096 on a mixed mathematical corpus consisting of web pages, source code,\ntextbooks, CoT templatised StackOverflow QA pairs, and mathematical lecture\nnotes in LaTeX curated by us. We also trained a math and code specialised BPE\ntokenizer. We proposed and performed CoT instruction fine-tuning of\nParamanu-Ganita on the MetaMathQA dataset. Our model Paramanu-Ganita, despite\nbeing 34 times smaller than the 7B LLMs, outperforms generalist LLMs by\napproximately 30% points, and even math-specialised LLMs by 3-23% points in\nGSM8K test accuracy metric. On MATH benchmark, Paramanu-Ganita outperformed the\nvarious models by 6-8% points. On benchmarks like LogiQA, MMLU (high school,\ncollege level), and competitive exams level, AGIEVAL (AQuA-RAT, SAT-Math),\nParamanu-Ganita outperformed others by 1-4%. 
Our model is available at\nhttps://huggingface.co/gyanai/paramanu-ganita-208M-hf .\n","authors":["Mitodru Niyogi","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2404.14395v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03717v1","updated":"2025-03-05T18:10:11Z","published":"2025-03-05T18:10:11Z","title":"Machine Learning in Biomechanics: Key Applications and Limitations in\n Walking, Running, and Sports Movements","summary":" This chapter provides an overview of recent and promising Machine Learning\napplications, i.e. pose estimation, feature estimation, event detection, data\nexploration & clustering, and automated classification, in gait (walking and\nrunning) and sports biomechanics. It explores the potential of Machine Learning\nmethods to address challenges in biomechanical workflows, highlights central\nlimitations, i.e. data and annotation availability and explainability, that\nneed to be addressed, and emphasises the importance of interdisciplinary\napproaches for fully harnessing the potential of Machine Learning in gait and\nsports biomechanics.\n","authors":["Carlo Dindorf","Fabian Horst","Djordje Slijepčević","Bernhard Dumphart","Jonas Dully","Matthias Zeppelzauer","Brian Horsak","Michael Fröhlich"],"pdf_url":"https://arxiv.org/pdf/2503.03717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03888v3","updated":"2025-03-05T18:04:40Z","published":"2025-01-07T15:51:49Z","title":"Neural DNF-MT: A Neuro-symbolic Approach for Learning Interpretable and\n Editable Policies","summary":" Although deep reinforcement learning has been shown to be effective, the\nmodel's black-box nature presents barriers to direct policy interpretation. To\naddress this problem, we propose a neuro-symbolic approach called neural DNF-MT\nfor end-to-end policy learning. The differentiable nature of the neural DNF-MT\nmodel enables the use of deep actor-critic algorithms for training. 
At the same\ntime, its architecture is designed so that trained models can be directly\ntranslated into interpretable policies expressed as standard (bivalent or\nprobabilistic) logic programs. Moreover, additional layers can be included to\nextract abstract features from complex observations, acting as a form of\npredicate invention. The logic representations are highly interpretable, and we\nshow how the bivalent representations of deterministic policies can be edited\nand incorporated back into a neural model, facilitating manual intervention and\nadaptation of learned policies. We evaluate our approach on a range of tasks\nrequiring learning deterministic or stochastic behaviours from various forms of\nobservations. Our empirical results show that our neural DNF-MT model performs\nat the level of competing black-box methods whilst providing interpretable\npolicies.\n","authors":["Kexin Gu Baugh","Luke Dickens","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2501.03888v3.pdf","comment":"AAMAS 2025 (with Appendix)"},{"id":"http://arxiv.org/abs/2503.03708v1","updated":"2025-03-05T17:59:19Z","published":"2025-03-05T17:59:19Z","title":"Rethinking Video Tokenization: A Conditioned Diffusion-based Approach","summary":" Video tokenizers, which transform videos into compact latent representations,\nare key to video generation. Existing video tokenizers are based on the VAE\narchitecture and follow a paradigm where an encoder compresses videos into\ncompact latents, and a deterministic decoder reconstructs the original videos\nfrom these latents. In this paper, we propose a novel\n\\underline{\\textbf{C}}onditioned \\underline{\\textbf{D}}iffusion-based video\n\\underline{\\textbf{T}}okenizer entitled \\textbf{\\ourmethod}, which departs from\nprevious methods by replacing the deterministic decoder with a 3D causal\ndiffusion model. The reverse diffusion generative process of the decoder is\nconditioned on the latent representations derived via the encoder. 
With a\nfeature caching and sampling acceleration, the framework efficiently\nreconstructs high-fidelity videos of arbitrary lengths. Results show that\n{\\ourmethod} achieves state-of-the-art performance in video reconstruction\ntasks using just a single-step sampling. Even a smaller version of {\\ourmethod}\nstill achieves reconstruction results on par with the top two baselines.\nFurthermore, the latent video generation model trained using {\\ourmethod} also\nshows superior performance.\n","authors":["Nianzu Yang","Pandeng Li","Liming Zhao","Yang Li","Chen-Wei Xie","Yehui Tang","Xudong Lu","Zhihang Liu","Yun Zheng","Yu Liu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2503.03708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03707v1","updated":"2025-03-05T17:58:16Z","published":"2025-03-05T17:58:16Z","title":"Curating Demonstrations using Online Experience","summary":" Many robot demonstration datasets contain heterogeneous demonstrations of\nvarying quality. This heterogeneity may benefit policy pre-training, but can\nhinder robot performance when used with a final imitation learning objective.\nIn particular, some strategies in the data may be less reliable than others or\nmay be underrepresented in the data, leading to poor performance when such\nstrategies are sampled at test time. Moreover, such unreliable or\nunderrepresented strategies can be difficult even for people to discern, and\nsifting through demonstration datasets is time-consuming and costly. On the\nother hand, policy performance when trained on such demonstrations can reflect\nthe reliability of different strategies. We thus propose for robots to\nself-curate based on online robot experience (Demo-SCORE). More specifically,\nwe train and cross-validate a classifier to discern successful policy roll-outs\nfrom unsuccessful ones and use the classifier to filter heterogeneous\ndemonstration datasets. 
Our experiments in simulation and the real world show\nthat Demo-SCORE can effectively identify suboptimal demonstrations without\nmanual curation. Notably, Demo-SCORE achieves over 15-35% higher absolute\nsuccess rate in the resulting policy compared to the base policy trained with\nall original demonstrations.\n","authors":["Annie S. Chen","Alec M. Lessing","Yuejiang Liu","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2503.03707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01776v2","updated":"2025-03-05T17:51:09Z","published":"2025-03-03T17:59:48Z","title":"Beyond Matryoshka: Revisiting Sparse Coding for Adaptive Representation","summary":" Many large-scale systems rely on high-quality deep representations\n(embeddings) to facilitate tasks like retrieval, search, and generative\nmodeling. Matryoshka Representation Learning (MRL) recently emerged as a\nsolution for adaptive embedding lengths, but it requires full model retraining\nand suffers from noticeable performance degradations at short lengths. In this\npaper, we show that sparse coding offers a compelling alternative for achieving\nadaptive representation with minimal overhead and higher fidelity. We propose\nContrastive Sparse Representation (CSR), a method that sparsifies pre-trained\nembeddings into a high-dimensional but selectively activated feature space. By\nleveraging lightweight autoencoding and task-aware contrastive objectives, CSR\npreserves semantic quality while allowing flexible, cost-effective inference at\ndifferent sparsity levels. Extensive experiments on image, text, and multimodal\nbenchmarks demonstrate that CSR consistently outperforms MRL in terms of both\naccuracy and retrieval speed-often by large margins-while also cutting training\ntime to a fraction of that required by MRL. Our results establish sparse coding\nas a powerful paradigm for adaptive representation learning in real-world\napplications where efficiency and fidelity are both paramount. 
Code is\navailable at https://github.com/neilwen987/CSR_Adaptive_Rep\n","authors":["Tiansheng Wen","Yifei Wang","Zequn Zeng","Zhong Peng","Yudi Su","Xinyang Liu","Bo Chen","Hongwei Liu","Stefanie Jegelka","Chenyu You"],"pdf_url":"https://arxiv.org/pdf/2503.01776v2.pdf","comment":"A novel sparse coding framework designed for learning adaptive\n representation"},{"id":"http://arxiv.org/abs/2410.08143v2","updated":"2025-03-05T17:50:44Z","published":"2024-10-10T17:30:09Z","title":"DelTA: An Online Document-Level Translation Agent Based on Multi-Level\n Memory","summary":" Large language models (LLMs) have achieved reasonable quality improvements in\nmachine translation (MT). However, most current research on MT-LLMs still faces\nsignificant challenges in maintaining translation consistency and accuracy when\nprocessing entire documents. In this paper, we introduce DelTA, a\nDocument-levEL Translation Agent designed to overcome these limitations. DelTA\nfeatures a multi-level memory structure that stores information across various\ngranularities and spans, including Proper Noun Records, Bilingual Summary,\nLong-Term Memory, and Short-Term Memory, which are continuously retrieved and\nupdated by auxiliary LLM-based components. Experimental results indicate that\nDelTA significantly outperforms strong baselines in terms of translation\nconsistency and quality across four open/closed-source LLMs and two\nrepresentative document translation datasets, achieving an increase in\nconsistency scores by up to 4.58 percentage points and in COMET scores by up to\n3.16 points on average. DelTA employs a sentence-by-sentence translation\nstrategy, ensuring no sentence omissions and offering a memory-efficient\nsolution compared to the mainstream method. Furthermore, DelTA improves pronoun\nand context-dependent translation accuracy, and the summary component of the\nagent also shows promise as a tool for query-based summarization tasks. 
The\ncode and data of our approach are released at\nhttps://github.com/YutongWang1216/DocMTAgent.\n","authors":["Yutong Wang","Jiali Zeng","Xuebo Liu","Derek F. Wong","Fandong Meng","Jie Zhou","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.08143v2.pdf","comment":"Accepted as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2503.03693v1","updated":"2025-03-05T17:43:49Z","published":"2025-03-05T17:43:49Z","title":"ILLC: Iterative Layer-by-Layer Compression for Enhancing Structural\n Faithfulness in SpArX","summary":" In the field of Explainable Artificial Intelligence (XAI), argumentative XAI\napproaches have been proposed to represent the internal reasoning process of\ndeep neural networks in a more transparent way by interpreting hidden nodes as\narguements. However, as the number of layers increases, existing compression\nmethods simplify all layers at once, which lead to high accumulative\ninformation loss. To compensate for this, we propose an iterative\nlayer-by-layer compression technique in which each layer is compressed\nseparately and the reduction error in the next layer is immediately compensated\nfor, thereby improving the overall input-output and structural fidelity of the\nmodel. Experiments on the Breast Cancer Diagnosis dataset show that, compared\nto traditional compression, the method reduces input-output and structural\nunfaithfulness, and maintains a more consistent attack-support relationship in\nthe Argumentative Explanation scheme. 
This is significant because it provides a\nnew way to make complex MLP models more compact while still conveying their\ninternal inference logic without distortion.\n","authors":["Ungsik Kim"],"pdf_url":"https://arxiv.org/pdf/2503.03693v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.08932v2","updated":"2025-03-05T17:11:13Z","published":"2024-11-13T03:16:18Z","title":"PyGen: A Collaborative Human-AI Approach to Python Package Creation","summary":" The principles of automation and innovation serve as foundational elements\nfor advancement in contemporary science and technology. Here, we introduce\nPygen, an automation platform designed to empower researchers, technologists,\nand hobbyists to bring abstract ideas to life as core, usable software tools\nwritten in Python. Pygen leverages the immense power of autoregressive large\nlanguage models to augment human creativity during the ideation, iteration, and\ninnovation process. By combining state-of-the-art language models with\nopen-source code generation technologies, Pygen has significantly reduced the\nmanual overhead of tool development. From a user prompt, Pygen automatically\ngenerates Python packages for a complete workflow from concept to package\ngeneration and documentation. The findings of our work show that Pygen\nconsiderably enhances the researcher's productivity by enabling the creation of\nresilient, modular, and well-documented packages for various specialized\npurposes. We employ a prompt enhancement approach to distill the user's package\ndescription into increasingly specific and actionable. While being inherently\nan open-ended task, we have evaluated the generated packages and the\ndocumentation using Human Evaluation, LLM-based evaluation, and CodeBLEU, with\ndetailed results in the results section. 
Furthermore, we documented our\nresults, analyzed the limitations, and suggested strategies to alleviate them.\nPygen is our vision of ethical automation, a framework that promotes\ninclusivity, accessibility, and collaborative development. This project marks\nthe beginning of a large-scale effort towards creating tools where intelligent\nagents collaborate with humans to improve scientific and technological\ndevelopment substantially.\n Our code and generated examples are open-sourced at\n[https://github.com/GitsSaikat/Pygen]\n","authors":["Saikat Barua","Mostafizur Rahman","Md Jafor Sadek","Rafiul Islam","Shehnaz Khaled","Md. Shohrab Hossain"],"pdf_url":"https://arxiv.org/pdf/2411.08932v2.pdf","comment":"33 pages, 13 figures"},{"id":"http://arxiv.org/abs/2410.17579v3","updated":"2025-03-05T17:09:46Z","published":"2024-10-23T06:08:45Z","title":"Bonsai: Gradient-free Graph Distillation for Node Classification","summary":" Graph distillation has emerged as a promising avenue to enable scalable\ntraining of GNNs by compressing the training dataset while preserving essential\ngraph characteristics. Our study uncovers significant shortcomings in current\ngraph distillation techniques. First, the majority of the algorithms\nparadoxically require training on the full dataset to perform distillation.\nSecond, due to their gradient-emulating approach, these methods require fresh\ndistillation for any change in hyperparameters or GNN architecture, limiting\ntheir flexibility and reusability. Finally, they fail to achieve substantial\nsize reduction due to synthesizing fully-connected, edge-weighted graphs. To\naddress these challenges, we present Bonsai, a novel graph distillation method\nempowered by the observation that \\textit{computation trees} form the\nfundamental processing units of message-passing GNNs. 
Bonsai distills datasets\nby encoding a careful selection of \\textit{exemplar} trees that maximize the\nrepresentation of all computation trees in the training set. This unique\napproach imparts Bonsai as the first linear-time, model-agnostic graph\ndistillation algorithm for node classification that outperforms existing\nbaselines across $6$ real-world datasets on accuracy, while being $22$ times\nfaster on average. Bonsai is grounded in rigorous mathematical guarantees on\nthe adopted approximation strategies making it robust to GNN architectures,\ndatasets, and parameters.\n","authors":["Mridul Gupta","Samyak Jain","Vansh Ramani","Hariprasad Kodamana","Sayan Ranu"],"pdf_url":"https://arxiv.org/pdf/2410.17579v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03669v1","updated":"2025-03-05T17:03:48Z","published":"2025-03-05T17:03:48Z","title":"Attentive Reasoning Queries: A Systematic Method for Optimizing\n Instruction-Following in Large Language Models","summary":" We present Attentive Reasoning Queries (ARQs), a novel structured reasoning\napproach that significantly improves instruction-following in Large Language\nModels through domain-specialized reasoning blueprints. While LLMs demonstrate\nremarkable capabilities across diverse tasks, they often fail to maintain\nadherence to complex, use-case-specific instructions during multi-turn\nconversations, presenting challenges for business-critical applications. ARQs\naddress this limitation by guiding LLMs through systematic reasoning steps with\ntargeted queries that reinstate critical instructions and facilitate\nintermediate reasoning throughout the completion process. In extensive testing\nwithin Parlant, our framework for reliable customer-facing agents in which ARQs\nwere born out of necessity, they achieved a 90.2% success rate across 87 test\nscenarios, outperforming both Chain-of-Thought reasoning (86.1%) and direct\nresponse generation (81.5%). 
ARQs showed particular strength in addressing\npersistent failure modes like guideline re-application and hallucination\nprevention. Our analysis also revealed that ARQs can potentially be more\ncomputationally efficient than free-form reasoning when carefully designed.\nThese findings demonstrate that structured reasoning approaches provide\neffective mechanisms for controlling how LLMs process information and make\ndecisions in complex scenarios.\n","authors":["Bar Karov","Dor Zohar","Yam Marcovitz"],"pdf_url":"https://arxiv.org/pdf/2503.03669v1.pdf","comment":"Supplementary materials, including code, is available on our GitHub:\n https://github.com/emcie-co/parlant/tree/arqs-a-systematic-method-for-optimizing-instruction-following-in-llms"},{"id":"http://arxiv.org/abs/2503.03664v1","updated":"2025-03-05T16:54:15Z","published":"2025-03-05T16:54:15Z","title":"A Generative Approach to High Fidelity 3D Reconstruction from Text Data","summary":" The convergence of generative artificial intelligence and advanced computer\nvision technologies introduces a groundbreaking approach to transforming\ntextual descriptions into three-dimensional representations. This research\nproposes a fully automated pipeline that seamlessly integrates text-to-image\ngeneration, various image processing techniques, and deep learning methods for\nreflection removal and 3D reconstruction. By leveraging state-of-the-art\ngenerative models like Stable Diffusion, the methodology translates natural\nlanguage inputs into detailed 3D models through a multi-stage workflow.\n The reconstruction process begins with the generation of high-quality images\nfrom textual prompts, followed by enhancement by a reinforcement learning agent\nand reflection removal using the Stable Delight model. Advanced image upscaling\nand background removal techniques are then applied to further enhance visual\nfidelity. 
These refined two-dimensional representations are subsequently\ntransformed into volumetric 3D models using sophisticated machine learning\nalgorithms, capturing intricate spatial relationships and geometric\ncharacteristics. This process achieves a highly structured and detailed output,\nensuring that the final 3D models reflect both semantic accuracy and geometric\nprecision.\n This approach addresses key challenges in generative reconstruction, such as\nmaintaining semantic coherence, managing geometric complexity, and preserving\ndetailed visual information. Comprehensive experimental evaluations will assess\nreconstruction quality, semantic accuracy, and geometric fidelity across\ndiverse domains and varying levels of complexity. By demonstrating the\npotential of AI-driven 3D reconstruction techniques, this research offers\nsignificant implications for fields such as augmented reality (AR), virtual\nreality (VR), and digital content creation.\n","authors":["Venkat Kumar R","Deepak Saravanan"],"pdf_url":"https://arxiv.org/pdf/2503.03664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10102v2","updated":"2025-03-05T16:51:06Z","published":"2024-02-15T16:56:25Z","title":"A privacy-preserving, distributed and cooperative FCM-based learning\n approach for cancer research","summary":" Distributed Artificial Intelligence is attracting interest day by day. In\nthis paper, the authors introduce an innovative methodology for distributed\nlearning of Particle Swarm Optimization-based Fuzzy Cognitive Maps in a\nprivacy-preserving way. The authors design a training scheme for collaborative\nFCM learning that offers data privacy compliant with the current regulation.\nThis method is applied to a cancer detection problem, proving that the\nperformance of the model is improved by the Federated Learning process, and\nobtaining similar results to the ones that can be found in the literature.\n","authors":["Jose L. 
Salmeron","Irina Arévalo"],"pdf_url":"https://arxiv.org/pdf/2402.10102v2.pdf","comment":"Rough Sets: International Joint Conference, IJCRS 2020"},{"id":"http://arxiv.org/abs/2410.16024v2","updated":"2025-03-05T16:49:51Z","published":"2024-10-21T13:58:38Z","title":"SMAC-R1: The Emergence of Intelligence in Decision-Making Tasks","summary":" StarCraft Multi-Agent Challenge (SMAC) has been one of the most commonly used\nexperimental environments in multi-agent reinforcement learning (MARL), where\nthe specific task is to control a set number of allied units to defeat enemy\nforces. Traditional MARL algorithms often require interacting with the\nenvironment for millions of steps to train a parametric model, of which the\nresulting policies are typically non-interpretable with weak transferability.\nIn this paper, we introduce SMAC-R1 which is based on the Qwen2.5-7B-Base LLM\ndistilled from DeepSeek-Coder-v2.5-236B. Similar to online reinforcement\nlearning after behavior cloning in offline learning process, in our pipeline,\nagents leverage the DeepSeek LLM to generate decision tree code by providing\ntask descriptions, and the agents are further self-reflected using feedback\nfrom the rewards provided by the environment. Based on that, we augment the\ngenerated scripts to fine-tune a small LLM, Qwen2.5-7B-Base, to distill the\ndecision-making ability via Supervised Fine-Tuning (SFT) and enhance the script\ngeneration ability by the Group Relative Policy Optimization (GRPO) algorithm.\nWe conduct experiments in the original 23 SMAC tasks and 10 newly-designed\ntasks to demonstrate that our method can produce high-quality, interpretable\ndecision trees with minimal environmental exploration. Moreover, these scripts\nexhibit strong transferability, successfully applying to homogeneous SMAC\nenvironments without modification. 
We believe this approach offers a new\ndirection for solving decision-making tasks and domain-specific LLM training\npipelines in the future.\n","authors":["Yue Deng","Weiyu Ma","Yuxin Fan","Ruyi Song","Yin Zhang","Haifeng Zhang","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.16024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07402v2","updated":"2025-03-05T16:48:23Z","published":"2024-09-11T16:42:22Z","title":"What to align in multimodal contrastive learning?","summary":" Humans perceive the world through multisensory integration, blending the\ninformation of different modalities to adapt their behavior. Contrastive\nlearning offers an appealing solution for multimodal self-supervised learning.\nIndeed, by considering each modality as a different view of the same entity, it\nlearns to align features of different modalities in a shared representation\nspace. However, this approach is intrinsically limited as it only learns shared\nor redundant information between modalities, while multimodal interactions can\narise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal\nlearning strategy that enables the communication between modalities in a single\nmultimodal space. Instead of imposing cross- or intra- modality constraints, we\npropose to align multimodal representations by maximizing the mutual\ninformation between augmented versions of these multimodal features. Our\ntheoretical analysis shows that shared, synergistic and unique terms of\ninformation naturally emerge from this formulation, allowing us to estimate\nmultimodal interactions beyond redundancy. We test CoMM both in a controlled\nand in a series of real-world settings: in the former, we demonstrate that CoMM\neffectively captures redundant, unique and synergistic information between\nmodalities. In the latter, CoMM learns complex multimodal interactions and\nachieves state-of-the-art results on the seven multimodal benchmarks. 
Code is\navailable at https://github.com/Duplums/CoMM\n","authors":["Benoit Dufumier","Javiera Castillo-Navarro","Devis Tuia","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2409.07402v2.pdf","comment":"ICLR 2025, 25 pages"},{"id":"http://arxiv.org/abs/2411.00816v2","updated":"2025-03-05T16:36:05Z","published":"2024-10-28T08:10:21Z","title":"CycleResearcher: Improving Automated Research via Automated Review","summary":" The automation of scientific discovery has been a long-standing goal within\nthe research community, driven by the potential to accelerate knowledge\ncreation. While significant progress has been made using commercial large\nlanguage models (LLMs) as research assistants or idea generators, the\npossibility of automating the entire research process with open-source LLMs\nremains largely unexplored. This paper explores the feasibility of using\nopen-source post-trained LLMs as autonomous agents capable of performing the\nfull cycle of automated research and review, from literature review and\nmanuscript preparation to peer review and paper refinement. Our iterative\npreference training framework consists of CycleResearcher, which conducts\nresearch tasks, and CycleReviewer, which simulates the peer review process,\nproviding iterative feedback via reinforcement learning. To train these models,\nwe develop two new datasets, Review-5k and Research-14k, reflecting real-world\nmachine learning research and peer review dynamics. Our results demonstrate\nthat CycleReviewer achieves promising performance with a 26.89\\% reduction in\nmean absolute error (MAE) compared to individual human reviewers in predicting\npaper scores, indicating the potential of LLMs to effectively assist\nexpert-level research evaluation. 
In research, the papers generated by the\nCycleResearcher model achieved a score of 5.36 in simulated peer reviews,\nshowing some competitiveness in terms of simulated review scores compared to\nthe preprint level of 5.24 from human experts, while still having room for\nimprovement compared to the accepted paper level of 5.69. This work represents\na significant step toward fully automated scientific inquiry, providing ethical\nsafeguards and exploring AI-driven research capabilities. The code, dataset and\nmodel weight are released at https://wengsyx.github.io/Researcher/\n","authors":["Yixuan Weng","Minjun Zhu","Guangsheng Bao","Hongbo Zhang","Jindong Wang","Yue Zhang","Linyi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.00816v2.pdf","comment":"Accept in ICLR 2025"},{"id":"http://arxiv.org/abs/2503.03655v1","updated":"2025-03-05T16:35:15Z","published":"2025-03-05T16:35:15Z","title":"Improving 6D Object Pose Estimation of metallic Household and Industry\n Objects","summary":" 6D object pose estimation suffers from reduced accuracy when applied to\nmetallic objects. We set out to improve the state-of-the-art by addressing\nchallenges such as reflections and specular highlights in industrial\napplications. Our novel BOP-compatible dataset, featuring a diverse set of\nmetallic objects (cans, household, and industrial items) under various lighting\nand background conditions, provides additional geometric and visual cues. We\ndemonstrate that these cues can be effectively leveraged to enhance overall\nperformance. To illustrate the usefulness of the additional features, we\nimprove upon the GDRNPP algorithm by introducing an additional keypoint\nprediction and material estimator head in order to improve spatial scene\nunderstanding. 
Evaluations on the new dataset show improved accuracy for\nmetallic objects, supporting the hypothesis that additional geometric and\nvisual cues can improve learning.\n","authors":["Thomas Pöllabauer","Michael Gasser","Tristan Wirth","Sarah Berkei","Volker Knauthe","Arjan Kuijper"],"pdf_url":"https://arxiv.org/pdf/2503.03655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03654v1","updated":"2025-03-05T16:32:47Z","published":"2025-03-05T16:32:47Z","title":"Improving Neutral Point of View Text Generation through\n Parameter-Efficient Reinforcement Learning and a Small-Scale High-Quality\n Dataset","summary":" This paper describes the construction of a dataset and the evaluation of\ntraining methods to improve generative large language models' (LLMs) ability to\nanswer queries on sensitive topics with a Neutral Point of View (NPOV), i.e.,\nto provide significantly more informative, diverse and impartial answers. The\ndataset, the SHQ-NPOV dataset, comprises 300 high-quality, human-written\nquadruplets: a query on a sensitive topic, an answer, an NPOV rating, and a set\nof links to source texts elaborating the various points of view. The first key\ncontribution of this paper is a new methodology to create such datasets through\niterative rounds of human peer-critique and annotator training, which we\nrelease alongside the dataset. The second key contribution is the\nidentification of a highly effective training regime for parameter-efficient\nreinforcement learning (PE-RL) to improve NPOV generation. 
We compare and\nextensively evaluate PE-RL and multiple baselines-including LoRA finetuning (a\nstrong baseline), SFT and RLHF.\n PE-RL not only improves on overall NPOV quality compared to the strongest\nbaseline ($97.06\\%\\rightarrow 99.08\\%$), but also scores much higher on\nfeatures linguists identify as key to separating good answers from the best\nanswers ($60.25\\%\\rightarrow 85.21\\%$ for presence of supportive details,\n$68.74\\%\\rightarrow 91.43\\%$ for absence of oversimplification). A qualitative\nanalysis corroborates this. Finally, our evaluation finds no statistical\ndifferences between results on topics that appear in the training dataset and\nthose on separated evaluation topics, which provides strong evidence that our\napproach to training PE-RL exhibits very effective out of topic generalization.\n","authors":["Jessica Hoffmann","Christiane Ahlheim","Zac Yu","Aria Walfrand","Jarvis Jin","Marie Tano","Ahmad Beirami","Erin van Liemt","Nithum Thain","Hakim Sidahmed","Lucas Dixon"],"pdf_url":"https://arxiv.org/pdf/2503.03654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20900v2","updated":"2025-03-05T16:23:09Z","published":"2025-02-28T09:57:20Z","title":"DexGraspVLA: A Vision-Language-Action Framework Towards General\n Dexterous Grasping","summary":" Dexterous grasping remains a fundamental yet challenging problem in robotics.\nA general-purpose robot must be capable of grasping diverse objects in\narbitrary scenarios. However, existing research typically relies on specific\nassumptions, such as single-object settings or limited environments, leading to\nconstrained generalization. Our solution is DexGraspVLA, a hierarchical\nframework that utilizes a pre-trained Vision-Language model as the high-level\ntask planner and learns a diffusion-based policy as the low-level Action\ncontroller. 
The key insight lies in iteratively transforming diverse language\nand visual inputs into domain-invariant representations, where imitation\nlearning can be effectively applied due to the alleviation of domain shift.\nThus, it enables robust generalization across a wide range of real-world\nscenarios. Notably, our method achieves a 90+% success rate under thousands of\nunseen object, lighting, and background combinations in a ``zero-shot''\nenvironment. Empirical analysis further confirms the consistency of internal\nmodel behavior across environmental variations, thereby validating our design\nand explaining its generalization performance. We hope our work can be a step\nforward in achieving general dexterous grasping. Our demo and code can be found\nat https://dexgraspvla.github.io/.\n","authors":["Yifan Zhong","Xuchuan Huang","Ruochong Li","Ceyao Zhang","Yitao Liang","Yaodong Yang","Yuanpei Chen"],"pdf_url":"https://arxiv.org/pdf/2502.20900v2.pdf","comment":"21 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.17755v2","updated":"2025-03-05T16:20:03Z","published":"2025-01-29T16:48:13Z","title":"AI Governance through Markets","summary":" This paper argues that market governance mechanisms should be considered a\nkey approach in the governance of artificial intelligence (AI), alongside\ntraditional regulatory frameworks. While current governance approaches have\npredominantly focused on regulation, we contend that market-based mechanisms\noffer effective incentives for responsible AI development. We examine four\nemerging vectors of market governance: insurance, auditing, procurement, and\ndue diligence, demonstrating how these mechanisms can affirm the relationship\nbetween AI risk and financial risk while addressing capital allocation\ninefficiencies. 
While we do not claim that market forces alone can adequately\nprotect societal interests, we maintain that standardised AI disclosures and\nmarket mechanisms can create powerful incentives for safe and responsible AI\ndevelopment. This paper urges regulators, economists, and machine learning\nresearchers to investigate and implement market-based approaches to AI\ngovernance.\n","authors":["Philip Moreira Tomei","Rupal Jain","Matija Franklin"],"pdf_url":"https://arxiv.org/pdf/2501.17755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02102v2","updated":"2025-03-05T16:18:33Z","published":"2025-03-03T22:37:03Z","title":"Provable Benefits of Task-Specific Prompts for In-context Learning","summary":" The in-context learning capabilities of modern language models have motivated\na deeper mathematical understanding of sequence models. A line of recent work\nhas shown that linear attention models can emulate projected gradient descent\niterations to implicitly learn the task vector from the data provided in the\ncontext window. In this work, we consider a novel setting where the global task\ndistribution can be partitioned into a union of conditional task distributions.\nWe then examine the use of task-specific prompts and prediction heads for\nlearning the prior information associated with the conditional task\ndistribution using a one-layer attention model. Our results on loss landscape\nshow that task-specific prompts facilitate a covariance-mean decoupling where\nprompt-tuning explains the conditional mean of the distribution whereas the\nvariance is learned/explained through in-context learning. Incorporating\ntask-specific head further aids this process by entirely decoupling estimation\nof mean and variance components. This covariance-mean perspective similarly\nexplains how jointly training prompt and attention weights can provably help\nover fine-tuning after pretraining.\n","authors":["Xiangyu Chang","Yingcong Li","Muti Kara","Samet Oymak","Amit K. 
Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2503.02102v2.pdf","comment":"Proceedings of the 28th International Conference on Artificial\n Intelligence and Statistics (AISTATS) 2025"},{"id":"http://arxiv.org/abs/2410.12893v2","updated":"2025-03-05T16:16:01Z","published":"2024-10-16T12:24:42Z","title":"MIRROR: A Novel Approach for the Automated Evaluation of Open-Ended\n Question Generation","summary":" Automatic question generation is a critical task that involves evaluating\nquestion quality by considering factors such as engagement, pedagogical value,\nand the ability to stimulate critical thinking. These aspects require\nhuman-like understanding and judgment, which automated systems currently lack.\nHowever, human evaluations are costly and impractical for large-scale samples\nof generated questions. Therefore, we propose a novel system, MIRROR (Multi-LLM\nIterative Review and Response for Optimized Rating), which leverages large\nlanguage models (LLMs) to automate the evaluation process for questions\ngenerated by automated question generation systems. We experimented with\nseveral state-of-the-art LLMs, such as GPT-4, Gemini, and Llama2-70b. We\nobserved that the scores of human evaluation metrics, namely relevance,\nappropriateness, novelty, complexity, and grammaticality, improved when using\nthe feedback-based approach called MIRROR, tending to be closer to the human\nbaseline scores. 
Furthermore, we observed that Pearson's correlation\ncoefficient between GPT-4 and human experts improved when using our proposed\nfeedback-based approach, MIRROR, compared to direct prompting for evaluation.\nError analysis shows that our proposed approach, MIRROR, significantly helps to\nimprove relevance and appropriateness.\n","authors":["Aniket Deroy","Subhankar Maity","Sudeshna Sarkar"],"pdf_url":"https://arxiv.org/pdf/2410.12893v2.pdf","comment":"NeurIPS'24 Workshop on Large Foundation Models for Educational\n Assessment (FM-EduAssess)"},{"id":"http://arxiv.org/abs/2409.06615v5","updated":"2025-03-05T16:07:20Z","published":"2024-09-10T16:11:57Z","title":"One-Shot Imitation under Mismatched Execution","summary":" Human demonstrations as prompts are a powerful way to program robots to do\nlong-horizon manipulation tasks. However, translating these demonstrations into\nrobot-executable actions presents significant challenges due to execution\nmismatches in movement styles and physical capabilities. Existing methods\neither depend on human-robot paired data, which is infeasible to scale, or rely\nheavily on frame-level visual similarities that often break down in practice.\nTo address these challenges, we propose RHyME, a novel framework that\nautomatically aligns human and robot task executions using optimal transport\ncosts. Given long-horizon robot demonstrations, RHyME synthesizes semantically\nequivalent human videos by retrieving and composing short-horizon human clips.\nThis approach facilitates effective policy training without the need for paired\ndata. RHyME successfully imitates a range of cross-embodiment demonstrators,\nboth in simulation and with a real human hand, achieving over 50\\% increase in\ntask success compared to previous methods. 
We release our code and datasets at\nhttps://portal-cornell.github.io/rhyme/.\n","authors":["Kushal Kedia","Prithwish Dan","Angela Chao","Maximus Adrian Pace","Sanjiban Choudhury"],"pdf_url":"https://arxiv.org/pdf/2409.06615v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21028v2","updated":"2025-03-05T15:52:43Z","published":"2025-02-28T13:16:34Z","title":"Measuring and identifying factors of individuals' trust in Large\n Language Models","summary":" Large Language Models (LLMs) can engage in human-looking conversational\nexchanges. Although conversations can elicit trust between users and LLMs,\nscarce empirical research has examined trust formation in human-LLM contexts,\nbeyond LLMs' trustworthiness or human trust in AI in general. Here, we\nintroduce the Trust-In-LLMs Index (TILLMI) as a new framework to measure\nindividuals' trust in LLMs, extending McAllister's cognitive and affective\ntrust dimensions to LLM-human interactions. We developed TILLMI as a\npsychometric scale, prototyped with a novel protocol we called LLM-simulated\nvalidity. The LLM-based scale was then validated in a sample of 1,000 US\nrespondents. Exploratory Factor Analysis identified a two-factor structure. Two\nitems were then removed due to redundancy, yielding a final 6-item scale with a\n2-factor structure. Confirmatory Factor Analysis on a separate subsample showed\nstrong model fit ($CFI = .995$, $TLI = .991$, $RMSEA = .046$, $p_{X^2} > .05$).\nConvergent validity analysis revealed that trust in LLMs correlated positively\nwith openness to experience, extraversion, and cognitive flexibility, but\nnegatively with neuroticism. Based on these findings, we interpreted TILLMI's\nfactors as \"closeness with LLMs\" (affective dimension) and \"reliance on LLMs\"\n(cognitive dimension). Younger males exhibited higher closeness with- and\nreliance on LLMs compared to older women. Individuals with no direct experience\nwith LLMs exhibited lower levels of trust compared to LLMs' users. 
These\nfindings offer a novel empirical foundation for measuring trust in AI-driven\nverbal communication, informing responsible design, and fostering balanced\nhuman-AI collaboration.\n","authors":["Edoardo Sebastiano De Duro","Giuseppe Alessandro Veltri","Hudson Golino","Massimo Stella"],"pdf_url":"https://arxiv.org/pdf/2502.21028v2.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2503.03606v1","updated":"2025-03-05T15:42:37Z","published":"2025-03-05T15:42:37Z","title":"Decoupled Recommender Systems: Exploring Alternative Recommender\n Ecosystem Designs","summary":" Recommender ecosystems are an emerging subject of research. Such research\nexamines how the characteristics of algorithms, recommendation consumers, and\nitem providers influence system dynamics and long-term outcomes. One\narchitectural possibility that has not yet been widely explored in this line of\nresearch is the consequences of a configuration in which recommendation\nalgorithms are decoupled from the platforms they serve. This is sometimes\ncalled \"the friendly neighborhood algorithm store\" or \"middleware\" model. We\nare particularly interested in how such architectures might offer a range of\ndifferent distributions of utility across consumers, providers, and\nrecommendation platforms. In this paper, we create a model of a recommendation\necosystem that incorporates algorithm choice and examine the outcomes of such a\ndesign.\n","authors":["Anas Buhayh","Elizabeth McKinnie","Robin Burke"],"pdf_url":"https://arxiv.org/pdf/2503.03606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03595v1","updated":"2025-03-05T15:28:50Z","published":"2025-03-05T15:28:50Z","title":"Towards Understanding Text Hallucination of Diffusion Models via Local\n Generation Bias","summary":" Score-based diffusion models have achieved incredible performance in\ngenerating realistic images, audio, and video data. 
While these models produce\nhigh-quality samples with impressive details, they often introduce unrealistic\nartifacts, such as distorted fingers or hallucinated texts with no meaning.\nThis paper focuses on textual hallucinations, where diffusion models correctly\ngenerate individual symbols but assemble them in a nonsensical manner. Through\nexperimental probing, we consistently observe that such phenomenon is\nattributed it to the network's local generation bias. Denoising networks tend\nto produce outputs that rely heavily on highly correlated local regions,\nparticularly when different dimensions of the data distribution are nearly\npairwise independent. This behavior leads to a generation process that\ndecomposes the global distribution into separate, independent distributions for\neach symbol, ultimately failing to capture the global structure, including\nunderlying grammar. Intriguingly, this bias persists across various denoising\nnetwork architectures including MLP and transformers which have the structure\nto model global dependency. These findings also provide insights into\nunderstanding other types of hallucinations, extending beyond text, as a result\nof implicit biases in the denoising models. Additionally, we theoretically\nanalyze the training dynamics for a specific case involving a two-layer MLP\nlearning parity points on a hypercube, offering an explanation of its\nunderlying mechanism.\n","authors":["Rui Lu","Runzhe Wang","Kaifeng Lyu","Xitai Jiang","Gao Huang","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03594v1","updated":"2025-03-05T15:27:36Z","published":"2025-03-05T15:27:36Z","title":"Small but Mighty: Enhancing Time Series Forecasting with Lightweight\n LLMs","summary":" While LLMs have demonstrated remarkable potential in time series forecasting,\ntheir practical deployment remains constrained by excessive computational\ndemands and memory footprints. 
Existing LLM-based approaches typically suffer\nfrom three critical limitations: Inefficient parameter utilization in handling\nnumerical time series patterns; Modality misalignment between continuous\ntemporal signals and discrete text embeddings; and Inflexibility for real-time\nexpert knowledge integration. We present SMETimes, the first systematic\ninvestigation of sub-3B parameter SLMs for efficient and accurate time series\nforecasting. Our approach centers on three key innovations: A\nstatistically-enhanced prompting mechanism that bridges numerical time series\nwith textual semantics through descriptive statistical features; A adaptive\nfusion embedding architecture that aligns temporal patterns with language model\ntoken spaces through learnable parameters; And a dynamic mixture-of-experts\nframework enabled by SLMs' computational efficiency, adaptively combining base\npredictions with domain-specific models. Extensive evaluations across seven\nbenchmark datasets demonstrate that our 3B-parameter SLM achieves\nstate-of-the-art performance on five primary datasets while maintaining 3.8x\nfaster training and 5.2x lower memory consumption compared to 7B-parameter LLM\nbaselines. Notably, the proposed model exhibits better learning capabilities,\nachieving 12.3% lower MSE than conventional LLM. Ablation studies validate that\nour statistical prompting and cross-modal fusion modules respectively\ncontribute 15.7% and 18.2% error reduction in long-horizon forecasting tasks.\nBy redefining the efficiency-accuracy trade-off landscape, this work\nestablishes SLMs as viable alternatives to resource-intensive LLMs for\npractical time series forecasting. 
Code and models are available at\nhttps://github.com/xiyan1234567/SMETimes.\n","authors":["Haoran Fan","Bin Li","Yixuan Weng","Shoujun Zhou"],"pdf_url":"https://arxiv.org/pdf/2503.03594v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2503.03592v1","updated":"2025-03-05T15:26:59Z","published":"2025-03-05T15:26:59Z","title":"English K_Quantization of LLMs Does Not Disproportionately Diminish\n Multilingual Performance","summary":" For consumer usage of locally deployed LLMs, the GGUF format and\nk_quantization are invaluable tools for maintaining the performance of the\noriginal model while reducing it to sizes deployable with consumer-grade\nhardware. The number of bits dedicated to each weight from the original model\nis reduced based on how important they are thought to be during model\ninference. This importance is arrived at through the application of an\n'importance matrix'-a relatively small text document meant to be representative\nof the LLM's standard use-cases. In the vast majority of quants available\nonline, this document is primarily written in English. It was therefore an open\nquestion whether performance on English language tasks was preserved through\nthe sacrifice of multilingual performance and whether it can be preserved with\nalternate importance matrices. This article investigates these hypotheses by\nquantizing Llama3.3 70B on importance matrices written in three languages\n(English, Norwegian, and Malayalam) and evaluating them on the MixEval dataset\nin both English and Norwegian. 
All experiments related to k_quantization\nyielded non-significant results (In all cases p > 0.237) indicating that\ncurrent quantization practices do not disproportionately harm multilingual\nperformance.\n","authors":["Karl Audun Borgersen"],"pdf_url":"https://arxiv.org/pdf/2503.03592v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.16207v3","updated":"2025-03-05T15:26:49Z","published":"2025-01-27T17:00:56Z","title":"From Informal to Formal -- Incorporating and Evaluating LLMs on Natural\n Language Requirements to Verifiable Formal Proofs","summary":" The research in AI-based formal mathematical reasoning has shown an\nunstoppable growth trend. These studies have excelled in mathematical\ncompetitions like IMO and have made significant progress. This paper focuses on\nformal verification, an immediate application scenario of formal reasoning, and\nbreaks it down into sub-tasks. We constructed 18k high-quality\ninstruction-response pairs across five formal specification languages (Coq,\nLean4, Dafny, ACSL, and TLA+) by distilling gpt-4o and evaluated against ten\nopen-sourced LLMs, including recent popular DeepSeek-R1. We also fine-tuned\nseveral 7~8B small models to achieve comparable performance with\nDeepseek-R1-671B. Interestingly, we observed that fine-tuning with formal data\nalso enhances mathematics, reasoning, and coding capabilities. 
Fine-tuned\nmodels are released at https: //huggingface.co/fm-universe.\n","authors":["Jialun Cao","Yaojie Lu","Meiziniu Li","Haoyang Ma","Haokun Li","Mengda He","Cheng Wen","Le Sun","Hongyu Zhang","Shengchao Qin","Shing-Chi Cheung","Cong Tian"],"pdf_url":"https://arxiv.org/pdf/2501.16207v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2501.01999v2","updated":"2025-03-05T15:26:17Z","published":"2025-01-01T07:00:41Z","title":"On the Utility of Equivariance and Symmetry Breaking in Deep Learning\n Architectures on Point Clouds","summary":" This paper explores the key factors that influence the performance of models\nworking with point clouds, across different tasks of varying geometric\ncomplexity. In this work, we explore the trade-offs between flexibility and\nweight-sharing introduced by equivariant layers, assessing when equivariance\nboosts or detracts from performance. It is often argued that providing more\ninformation as input improves a model's performance. However, if this\nadditional information breaks certain properties, such as $\\SE(3)$\nequivariance, does it remain beneficial? We identify the key aspects of\nequivariant and non-equivariant architectures that drive success in different\ntasks by benchmarking them on segmentation, regression, and generation tasks\nacross multiple datasets with increasing complexity. 
We observe a positive\nimpact of equivariance, which becomes more pronounced with increasing task\ncomplexity, even when strict equivariance is not required.\n","authors":["Sharvaree Vadgama","Mohammad Mohaiminul Islam","Domas Buracus","Christian Shewmake","Erik Bekkers"],"pdf_url":"https://arxiv.org/pdf/2501.01999v2.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2503.02623v2","updated":"2025-03-05T15:23:16Z","published":"2025-03-04T13:48:50Z","title":"Rewarding Doubt: A Reinforcement Learning Approach to Confidence\n Calibration of Large Language Models","summary":" A safe and trustworthy use of Large Language Models (LLMs) requires an\naccurate expression of confidence in their answers. We introduce a novel\nReinforcement Learning (RL) approach for LLM calibration that fine-tunes LLMs\nto elicit calibrated confidence estimations in their answers to factual\nquestions. We model the problem as a betting game where the model predicts a\nconfidence score together with every answer, and design a reward function that\npenalizes both over and under-confidence. We prove that under our reward design\nan optimal policy would result in a perfectly calibrated confidence estimation.\nOur experiments demonstrate significantly improved confidence calibration and\ngeneralization to new tasks without re-training, indicating that our approach\nteaches a general confidence awareness. 
This approach enables the training of\ninherently calibrated LLMs.\n","authors":["Paul Stangel","David Bani-Harouni","Chantal Pellegrini","Ege Özsoy","Kamilia Zaripova","Matthias Keicher","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2503.02623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20427v2","updated":"2025-03-05T14:58:33Z","published":"2025-02-27T12:26:25Z","title":"DeePen: Penetration Testing for Audio Deepfake Detection","summary":" Deepfakes - manipulated or forged audio and video media - pose significant\nsecurity risks to individuals, organizations, and society at large. To address\nthese challenges, machine learning-based classifiers are commonly employed to\ndetect deepfake content. In this paper, we assess the robustness of such\nclassifiers through a systematic penetration testing methodology, which we\nintroduce as DeePen. Our approach operates without prior knowledge of or access\nto the target deepfake detection models. Instead, it leverages a set of\ncarefully selected signal processing modifications - referred to as attacks -\nto evaluate model vulnerabilities. Using DeePen, we analyze both real-world\nproduction systems and publicly available academic model checkpoints,\ndemonstrating that all tested systems exhibit weaknesses and can be reliably\ndeceived by simple manipulations such as time-stretching or echo addition.\nFurthermore, our findings reveal that while some attacks can be mitigated by\nretraining detection systems with knowledge of the specific attack, others\nremain persistently effective. 
We release all associated code.\n","authors":["Nicolas Müller","Piotr Kawa","Adriana Stan","Thien-Phuc Doan","Souhwan Jung","Wei Herng Choong","Philip Sperl","Konstantin Böttinger"],"pdf_url":"https://arxiv.org/pdf/2502.20427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11613v2","updated":"2025-03-05T14:55:49Z","published":"2024-07-16T11:22:34Z","title":"Bringing AI Participation Down to Scale: A Comment on Open AIs\n Democratic Inputs to AI Project","summary":" In 2023, Open AIs Democratic Inputs program funded 10 teams to design\nprocedures for public participation in generative AI. In this Perspective, we\nreview the results of the project, drawing on interviews with some of the teams\nand our own experiences conducting participation exercises, we identify several\nshared yet largely unspoken assumptions of the Democratic Inputs program 1.\nthat participation must be scalable 2. that the object of participation is a\nsingle model 3. that there must be a single form of participation 4. that the\ngoal is to extract abstract principles 5. that these principles should have\nconsensus 6. that publics should be representative and encourage alternative\nforms of participation in AI, perhaps not undertaken by tech companies.\n","authors":["David Moats","Chandrima Ganguly"],"pdf_url":"https://arxiv.org/pdf/2407.11613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03563v1","updated":"2025-03-05T14:51:46Z","published":"2025-03-05T14:51:46Z","title":"A Conceptual Model for Attributions in Event-Centric Knowledge Graphs","summary":" The use of narratives as a means of fusing information from knowledge graphs\n(KGs) into a coherent line of argumentation has been the subject of recent\ninvestigation. Narratives are especially useful in event-centric knowledge\ngraphs in that they provide a means to connect different real-world events and\ncategorize them by well-known narrations. 
However, specifically for\ncontroversial events, a problem in information fusion arises, namely, multiple\nviewpoints regarding the validity of certain event aspects, e.g., regarding the\nrole a participant takes in an event, may exist. Expressing those viewpoints in\nKGs is challenging because disputed information provided by different\nviewpoints may introduce inconsistencies. Hence, most KGs only feature a single\nview on the contained information, hampering the effectiveness of narrative\ninformation access. This paper is an extension of our original work and\nintroduces attributions, i.e., parameterized predicates that allow for the\nrepresentation of facts that are only valid in a specific viewpoint. For this,\nwe develop a conceptual model that allows for the representation of\nviewpoint-dependent information. As an extension, we enhance the model by a\nconception of viewpoint-compatibility. Based on this, we deepen our original\ndeliberations on the model's effects on information fusion and provide\nadditional grounding in the literature.\n","authors":["Florian Plötzky","Katarina Britz","Wolf-Tilo Balke"],"pdf_url":"https://arxiv.org/pdf/2503.03563v1.pdf","comment":"Submitted to Data & Knowledge Engineering, 22 pages, 9 figures"},{"id":"http://arxiv.org/abs/2503.03562v1","updated":"2025-03-05T14:49:08Z","published":"2025-03-05T14:49:08Z","title":"Towards Visual Discrimination and Reasoning of Real-World Physical\n Dynamics: Physics-Grounded Anomaly Detection","summary":" Humans detect real-world object anomalies by perceiving, interacting, and\nreasoning based on object-conditioned physical knowledge. The long-term goal of\nIndustrial Anomaly Detection (IAD) is to enable machines to autonomously\nreplicate this skill. 
However, current IAD algorithms are largely developed and\ntested on static, semantically simple datasets, which diverge from real-world\nscenarios where physical understanding and reasoning are essential.To bridge\nthis gap, we introduce the Physics Anomaly Detection (Phys-AD) dataset, the\nfirst large-scale, real-world, physics-grounded video dataset for industrial\nanomaly detection. Collected using a real robot arm and motor, Phys-AD provides\na diverse set of dynamic, semantically rich scenarios. The dataset includes\nmore than 6400 videos across 22 real-world object categories, interacting with\nrobot arms and motors, and exhibits 47 types of anomalies. Anomaly detection in\nPhys-AD requires visual reasoning, combining both physical knowledge and video\ncontent to determine object abnormality.We benchmark state-of-the-art anomaly\ndetection methods under three settings: unsupervised AD, weakly-supervised AD,\nand video-understanding AD, highlighting their limitations in handling\nphysics-grounded anomalies. Additionally, we introduce the Physics Anomaly\nExplanation (PAEval) metric, designed to assess the ability of visual-language\nfoundation models to not only detect anomalies but also provide accurate\nexplanations for their underlying physical causes. Our dataset and benchmark\nwill be publicly available.\n","authors":["Wenqiao Li","Yao Gu","Xintao Chen","Xiaohao Xu","Ming Hu","Xiaonan Huang","Yingna Wu"],"pdf_url":"https://arxiv.org/pdf/2503.03562v1.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2411.13982v2","updated":"2025-03-05T14:45:55Z","published":"2024-11-21T09:47:13Z","title":"Safety Without Semantic Disruptions: Editing-free Safe Image Generation\n via Context-preserving Dual Latent Reconstruction","summary":" Training multimodal generative models on large, uncurated datasets can result\nin users being exposed to harmful, unsafe and controversial or\nculturally-inappropriate outputs. 
While model editing has been proposed to\nremove or filter undesirable concepts in embedding and latent spaces, it can\ninadvertently damage learned manifolds, distorting concepts in close semantic\nproximity. We identify limitations in current model editing techniques, showing\nthat even benign, proximal concepts may become misaligned. To address the need\nfor safe content generation, we leverage safe embeddings and a modified\ndiffusion process with tunable weighted summation in the latent space to\ngenerate safer images. Our method preserves global context without compromising\nthe structural integrity of the learned manifolds. We achieve state-of-the-art\nresults on safe image generation benchmarks and offer intuitive control over\nthe level of model safety. We identify trade-offs between safety and\ncensorship, which presents a necessary perspective in the development of\nethical AI models. We will release our code.\n Keywords: Text-to-Image Models, Generative AI, Safety, Reliability, Model\nEditing\n","authors":["Jordan Vice","Naveed Akhtar","Mubarak Shah","Richard Hartley","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2411.13982v2.pdf","comment":"This research is supported by the NISDRG project #20100007, funded by\n the Australian Government"},{"id":"http://arxiv.org/abs/2407.16205v5","updated":"2025-03-05T14:43:33Z","published":"2024-07-23T06:14:41Z","title":"LLMs can be Dangerous Reasoners: Analyzing-based Jailbreak Attack on\n Large Language Models","summary":" The rapid development of Large Language Models (LLMs) has brought significant\nadvancements across various tasks. However, despite these achievements, LLMs\nstill exhibit inherent safety vulnerabilities, especially when confronted with\njailbreak attacks. Existing jailbreak methods suffer from two main limitations:\nreliance on complicated prompt engineering and iterative optimization, which\nlead to low attack success rate (ASR) and attack efficiency (AE). 
In this work,\nwe propose an efficient jailbreak attack method, Analyzing-based Jailbreak\n(ABJ), which leverages the advanced reasoning capability of LLMs to\nautonomously generate harmful content, revealing their underlying safety\nvulnerabilities during complex reasoning process. We conduct comprehensive\nexperiments on ABJ across various open-source and closed-source LLMs. In\nparticular, ABJ achieves high ASR (82.1% on GPT-4o-2024-11-20) with exceptional\nAE among all target LLMs, showcasing its remarkable attack effectiveness,\ntransferability, and efficiency. Our findings underscore the urgent need to\nprioritize and improve the safety of LLMs to mitigate the risks of misuse.\n","authors":["Shi Lin","Hongming Yang","Dingyang Lin","Rongchang Li","Xun Wang","Changting Lin","Wenpeng Xing","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2407.16205v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.07115v3","updated":"2025-03-05T14:43:01Z","published":"2025-02-10T23:11:44Z","title":"Online Scheduling for LLM Inference with KV Cache Constraints","summary":" Large Language Model (LLM) inference, where a trained model generates text\none word at a time in response to user prompts, is a computationally intensive\nprocess requiring efficient scheduling to optimize latency and resource\nutilization. A key challenge in LLM inference is the management of the\nKey-Value (KV) cache, which reduces redundant computations but introduces\nmemory constraints. In this work, we model LLM inference with KV cache\nconstraints theoretically and propose novel batching and scheduling algorithms\nthat minimize inference latency while effectively managing the KV cache's\nmemory.\n We analyze both semi-online and fully online scheduling models, and our\nresults are threefold. First, we provide a polynomial-time algorithm that\nachieves exact optimality in terms of average latency in the semi-online prompt\narrival model. 
Second, in the fully online case with a stochastic prompt\narrival, we introduce an efficient online scheduling algorithm with constant\nregret. Third, we prove that no algorithm (deterministic or randomized) can\nachieve a constant competitive ratio in fully online adversarial settings. Our\nempirical evaluations on a public LLM inference dataset, using the Llama-70B\nmodel on A100 GPUs, show that our approach significantly outperforms benchmark\nalgorithms used currently in practice, achieving lower latency while reducing\nenergy consumption. Overall, our results offer a path toward more sustainable\nand cost-effective LLM deployment.\n","authors":["Patrick Jaillet","Jiashuo Jiang","Chara Podimata","Zijie Zhou"],"pdf_url":"https://arxiv.org/pdf/2502.07115v3.pdf","comment":"Will add a lemma in the proof of Theorem 5.3 to make the statement\n and proof more rigorous"},{"id":"http://arxiv.org/abs/2502.11681v4","updated":"2025-03-05T14:38:19Z","published":"2025-02-17T11:16:19Z","title":"RIDE: Enhancing Large Language Model Alignment through Restyled\n In-Context Learning Demonstration Exemplars","summary":" Alignment tuning is crucial for ensuring large language models (LLMs) behave\nethically and helpfully. Current alignment approaches require high-quality\nannotations and significant training resources. This paper proposes a low-cost,\ntuning-free method using in-context learning (ICL) to enhance LLM alignment.\nThrough an analysis of high-quality ICL demos, we identified style as a key\nfactor influencing LLM alignment capabilities and explicitly restyled ICL\nexemplars based on this stylistic framework. Additionally, we combined the\nrestyled demos to achieve a balance between the two conflicting aspects of LLM\nalignment--factuality and safety. We packaged the restyled examples as prompts\nto trigger few-shot learning, improving LLM alignment. 
Compared to the best\nbaseline approach, with an average score of 5.00 as the maximum, our method\nachieves a maximum 0.10 increase on the Alpaca task (from 4.50 to 4.60), a 0.22\nenhancement on the Just-eval benchmark (from 4.34 to 4.56), and a maximum\nimprovement of 0.32 (from 3.53 to 3.85) on the MT-Bench dataset. We release the\ncode and data at https://github.com/AnonymousCode-ComputerScience/RIDE.\n","authors":["Yuncheng Hua","Lizhen Qu","Zhuang Li","Hao Xue","Flora D. Salim","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2502.11681v4.pdf","comment":"38 pages, 2 figures, 20 tables; The paper is under review in ARR"},{"id":"http://arxiv.org/abs/2503.03532v1","updated":"2025-03-05T14:14:25Z","published":"2025-03-05T14:14:25Z","title":"AI-Enabled Conversational Journaling for Advancing Parkinson's Disease\n Symptom Tracking","summary":" Journaling plays a crucial role in managing chronic conditions by allowing\npatients to document symptoms and medication intake, providing essential data\nfor long-term care. While valuable, traditional journaling methods often rely\non static, self-directed entries, lacking interactive feedback and real-time\nguidance. This gap can result in incomplete or imprecise information, limiting\nits usefulness for effective treatment. To address this gap, we introduce\nPATRIKA, an AI-enabled prototype designed specifically for people with\nParkinson's disease (PwPD). The system incorporates cooperative conversation\nprinciples, clinical interview simulations, and personalization to create a\nmore effective and user-friendly journaling experience. Through two user\nstudies with PwPD and iterative refinement of PATRIKA, we demonstrate\nconversational journaling's significant potential in patient engagement and\ncollecting clinically valuable information. 
Our results showed that generating\nprobing questions PATRIKA turned journaling into a bi-directional interaction.\nAdditionally, we offer insights for designing journaling systems for healthcare\nand future directions for promoting sustained journaling.\n","authors":["Mashrur Rashik","Shilpa Sweth","Nishtha Agrawal","Saiyyam Kochar","Kara M Smith","Fateme Rajabiyazdi","Vidya Setlur","Narges Mahyar","Ali Sarvghad"],"pdf_url":"https://arxiv.org/pdf/2503.03532v1.pdf","comment":"To appear in the ACM CHI conference on Human Factors in Computing\n Systems (CHI), 2025"},{"id":"http://arxiv.org/abs/2409.16502v2","updated":"2025-03-05T14:11:44Z","published":"2024-09-24T23:18:32Z","title":"GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for\n Improved Visual Localization","summary":" Although various visual localization approaches exist, such as scene\ncoordinate regression and camera pose regression, these methods often struggle\nwith optimization complexity or limited accuracy. To address these challenges,\nwe explore the use of novel view synthesis techniques, particularly 3D Gaussian\nSplatting (3DGS), which enables the compact encoding of both 3D geometry and\nscene appearance. We propose a two-stage procedure that integrates dense and\nrobust keypoint descriptors from the lightweight XFeat feature extractor into\n3DGS, enhancing performance in both indoor and outdoor environments. The coarse\npose estimates are directly obtained via 2D-3D correspondences between the 3DGS\nrepresentation and query image descriptors. In the second stage, the initial\npose estimate is refined by minimizing the rendering-based photometric warp\nloss. 
Benchmarking on widely used indoor and outdoor datasets demonstrates\nimprovements over recent neural rendering-based localization methods, such as\nNeRFMatch and PNeRFLoc.\n","authors":["Gennady Sidorov","Malik Mohrat","Denis Gridusov","Ruslan Rakhimov","Sergey Kolyubin"],"pdf_url":"https://arxiv.org/pdf/2409.16502v2.pdf","comment":"Project website at https://gsplatloc.github.io/"},{"id":"http://arxiv.org/abs/2503.03528v1","updated":"2025-03-05T14:11:13Z","published":"2025-03-05T14:11:13Z","title":"AdaSin: Enhancing Hard Sample Metrics with Dual Adaptive Penalty for\n Face Recognition","summary":" In recent years, the emergence of deep convolutional neural networks has\npositioned face recognition as a prominent research focus in computer vision.\nTraditional loss functions, such as margin-based, hard-sample mining-based, and\nhybrid approaches, have achieved notable performance improvements, with some\nleveraging curriculum learning to optimize training. However, these methods\noften fall short in effectively quantifying the difficulty of hard samples. To\naddress this, we propose Adaptive Sine (AdaSin) loss function, which introduces\nthe sine of the angle between a sample's embedding feature and its ground-truth\nclass center as a novel difficulty metric. This metric enables precise and\neffective penalization of hard samples. By incorporating curriculum learning,\nthe model dynamically adjusts classification boundaries across different\ntraining stages. Unlike previous adaptive-margin loss functions, AdaSin\nintroduce a dual adaptive penalty, applied to both the positive and negative\ncosine similarities of hard samples. This design imposes stronger constraints,\nenhancing intra-class compactness and inter-class separability. The combination\nof the dual adaptive penalty and curriculum learning is guided by a\nwell-designed difficulty metric. 
It enables the model to focus more effectively\non hard samples in later training stages, and lead to the extraction of highly\ndiscriminative face features. Extensive experiments across eight benchmarks\ndemonstrate that AdaSin achieves superior accuracy compared to other\nstate-of-the-art methods.\n","authors":["Qiqi Guo","Zhuowen Zheng","Guanghua Yang","Zhiquan Liu","Xiaofan Li","Jianqing Li","Jinyu Tian","Xueyuan Gong"],"pdf_url":"https://arxiv.org/pdf/2503.03528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09444v3","updated":"2025-03-05T14:02:10Z","published":"2024-01-31T15:37:12Z","title":"Multimodal Action Quality Assessment","summary":" Action quality assessment (AQA) is to assess how well an action is performed.\nPrevious works perform modelling by only the use of visual information,\nignoring audio information. We argue that although AQA is highly dependent on\nvisual information, the audio is useful complementary information for improving\nthe score regression accuracy, especially for sports with background music,\nsuch as figure skating and rhythmic gymnastics. To leverage multimodal\ninformation for AQA, i.e., RGB, optical flow and audio information, we propose\na Progressive Adaptive Multimodal Fusion Network (PAMFN) that separately models\nmodality-specific information and mixed-modality information. Our model\nconsists of with three modality-specific branches that independently explore\nmodality-specific information and a mixed-modality branch that progressively\naggregates the modality-specific information from the modality-specific\nbranches. To build the bridge between modality-specific branches and the\nmixed-modality branch, three novel modules are proposed. First, a\nModality-specific Feature Decoder module is designed to selectively transfer\nmodality-specific information to the mixed-modality branch. 
Second, when\nexploring the interaction between modality-specific information, we argue that\nusing an invariant multimodal fusion policy may lead to suboptimal results, so\nas to take the potential diversity in different parts of an action into\nconsideration. Therefore, an Adaptive Fusion Module is proposed to learn\nadaptive multimodal fusion policies in different parts of an action. This\nmodule consists of several FusionNets for exploring different multimodal fusion\nstrategies and a PolicyNet for deciding which FusionNets are enabled. Third, a\nmodule called Cross-modal Feature Decoder is designed to transfer cross-modal\nfeatures generated by Adaptive Fusion Module to the mixed-modality branch.\n","authors":["Ling-An Zeng","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.09444v3.pdf","comment":"IEEE Transactions on Image Processing 2024"},{"id":"http://arxiv.org/abs/2503.03511v1","updated":"2025-03-05T13:57:37Z","published":"2025-03-05T13:57:37Z","title":"NeuGrasp: Generalizable Neural Surface Reconstruction with Background\n Priors for Material-Agnostic Object Grasp Detection","summary":" Robotic grasping in scenes with transparent and specular objects presents\ngreat challenges for methods relying on accurate depth information. In this\npaper, we introduce NeuGrasp, a neural surface reconstruction method that\nleverages background priors for material-agnostic grasp detection. NeuGrasp\nintegrates transformers and global prior volumes to aggregate multi-view\nfeatures with spatial encoding, enabling robust surface reconstruction in\nnarrow and sparse viewing conditions. By focusing on foreground objects through\nresidual feature enhancement and refining spatial perception with an\noccupancy-prior volume, NeuGrasp excels in handling objects with transparent\nand specular surfaces. 
Extensive experiments in both simulated and real-world\nscenarios show that NeuGrasp outperforms state-of-the-art methods in grasping\nwhile maintaining comparable reconstruction quality. More details are available\nat https://neugrasp.github.io/.\n","authors":["Qingyu Fan","Yinghao Cai","Chao Li","Wenzhe He","Xudong Zheng","Tao Lu","Bin Liang","Shuo Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03511v1.pdf","comment":"7 pages, 5 figures. IEEE International Conference on Robotics and\n Automation (ICRA) 2025"},{"id":"http://arxiv.org/abs/2503.03506v1","updated":"2025-03-05T13:54:13Z","published":"2025-03-05T13:54:13Z","title":"Rethinking Synthetic Data definitions: A privacy driven approach","summary":" Synthetic data is gaining traction as a cost-effective solution for the\nincreasing data demands of AI development and can be generated either from\nexisting knowledge or derived data captured from real-world events. The source\nof the synthetic data generation and the technique used significantly impacts\nits residual privacy risk and therefore its opportunity for sharing.\nTraditional classification of synthetic data types no longer fit the newer\ngeneration techniques and there is a need to better align the classification\nwith practical needs. We suggest a new way of grouping synthetic data types\nthat better supports privacy evaluations to aid regulatory policymaking. 
Our\nnovel classification provides flexibility to new advancements like deep\ngenerative methods and offers a more practical framework for future\napplications.\n","authors":["Vibeke Binz Vallevik","Serena Elizabeth Marshall","Aleksandar Babic","Jan Franz Nygaard"],"pdf_url":"https://arxiv.org/pdf/2503.03506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03505v1","updated":"2025-03-05T13:53:10Z","published":"2025-03-05T13:53:10Z","title":"Parallelized Planning-Acting for Efficient LLM-based Multi-Agent Systems","summary":" Recent advancements in Large Language Model(LLM)-based Multi-Agent\nSystems(MAS) have demonstrated remarkable potential for tackling complex\ndecision-making tasks. However, existing frameworks inevitably rely on\nserialized execution paradigms, where agents must complete sequential LLM\nplanning before taking action. This fundamental constraint severely limits\nreal-time responsiveness and adaptation, which is crucial in dynamic\nenvironments with ever-changing scenarios. In this paper, we propose a novel\nparallelized planning-acting framework for LLM-based MAS, featuring a\ndual-thread architecture with interruptible execution to enable concurrent\nplanning and acting. Specifically, our framework comprises two core threads:(1)\na planning thread driven by a centralized memory system, maintaining\nsynchronization of environmental states and agent communication to support\ndynamic decision-making; and (2) an acting thread equipped with a comprehensive\nskill library, enabling automated task execution through recursive\ndecomposition. 
Extensive experiments on challenging Minecraft demonstrate the\neffectiveness of the proposed framework.\n","authors":["Yaoru Li","Shunyu Liu","Tongya Zheng","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2503.03505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03503v1","updated":"2025-03-05T13:47:55Z","published":"2025-03-05T13:47:55Z","title":"Collaborative Expert LLMs Guided Multi-Objective Molecular Optimization","summary":" Molecular optimization is a crucial yet complex and time-intensive process\nthat often acts as a bottleneck for drug development. Traditional methods rely\nheavily on trial and error, making multi-objective optimization both\ntime-consuming and resource-intensive. Current AI-based methods have shown\nlimited success in handling multi-objective optimization tasks, hampering their\npractical utilization. To address this challenge, we present MultiMol, a\ncollaborative large language model (LLM) system designed to guide\nmulti-objective molecular optimization. MultiMol comprises two agents,\nincluding a data-driven worker agent and a literature-guided research agent.\nThe data-driven worker agent is a large language model being fine-tuned to\nlearn how to generate optimized molecules considering multiple objectives,\nwhile the literature-guided research agent is responsible for searching\ntask-related literature to find useful prior knowledge that facilitates\nidentifying the most promising optimized candidates. In evaluations across six\nmulti-objective optimization tasks, MultiMol significantly outperforms existing\nmethods, achieving a 82.30% success rate, in sharp contrast to the 27.50%\nsuccess rate of current strongest methods. To further validate its practical\nimpact, we tested MultiMol on two real-world challenges. First, we enhanced the\nselectivity of Xanthine Amine Congener (XAC), a promiscuous ligand that binds\nboth A1R and A2AR, successfully biasing it towards A1R. 
Second, we improved the\nbioavailability of Saquinavir, an HIV-1 protease inhibitor with known\nbioavailability limitations. Overall, these results indicate that MultiMol\nrepresents a highly promising approach for multi-objective molecular\noptimization, holding great potential to accelerate the drug development\nprocess and contribute to the advancement of pharmaceutical research.\n","authors":["Jiajun Yu","Yizhen Zheng","Huan Yee Koh","Shirui Pan","Tianyue Wang","Haishuai Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03502v1","updated":"2025-03-05T13:47:53Z","published":"2025-03-05T13:47:53Z","title":"CURVALID: Geometrically-guided Adversarial Prompt Detection","summary":" Adversarial prompts capable of jailbreaking large language models (LLMs) and\ninducing undesirable behaviours pose a significant obstacle to their safe\ndeployment. Current mitigation strategies rely on activating built-in defence\nmechanisms or fine-tuning the LLMs, but the fundamental distinctions between\nadversarial and benign prompts are yet to be understood. In this work, we\nintroduce CurvaLID, a novel defense framework that efficiently detects\nadversarial prompts by leveraging their geometric properties. It is agnostic to\nthe type of LLM, offering a unified detection framework across diverse\nadversarial prompts and LLM architectures. CurvaLID builds on the geometric\nanalysis of text prompts to uncover their underlying differences. We\ntheoretically extend the concept of curvature via the Whewell equation into an\n$n$-dimensional word embedding space, enabling us to quantify local geometric\nproperties, including semantic shifts and curvature in the underlying\nmanifolds. Additionally, we employ Local Intrinsic Dimensionality (LID) to\ncapture geometric features of text prompts within adversarial subspaces. 
Our\nfindings reveal that adversarial prompts differ fundamentally from benign\nprompts in terms of their geometric characteristics. Our results demonstrate\nthat CurvaLID delivers superior detection and rejection of adversarial queries,\npaving the way for safer LLM deployment. The source code can be found at\nhttps://github.com/Cancanxxx/CurvaLID\n","authors":["Canaan Yung","Hanxun Huang","Sarah Monazam Erfani","Christopher Leckie"],"pdf_url":"https://arxiv.org/pdf/2503.03502v1.pdf","comment":"29 Pages, 5 figues"},{"id":"http://arxiv.org/abs/2412.18355v2","updated":"2025-03-05T13:25:09Z","published":"2024-12-24T11:35:40Z","title":"Handling Spatial-Temporal Data Heterogeneity for Federated Continual\n Learning via Tail Anchor","summary":" Federated continual learning (FCL) allows each client to continually update\nits knowledge from task streams, enhancing the applicability of federated\nlearning in real-world scenarios. However, FCL needs to address not only\nspatial data heterogeneity between clients but also temporal data heterogeneity\nbetween tasks. In this paper, empirical experiments demonstrate that such\ninput-level heterogeneity significantly affects the model's internal parameters\nand outputs, leading to severe spatial-temporal catastrophic forgetting of\nlocal and previous knowledge. To this end, we propose Federated Tail Anchor\n(FedTA) to mix trainable Tail Anchor with the frozen output features to adjust\ntheir position in the feature space, thereby overcoming parameter-forgetting\nand output-forgetting. Three novel components are also included: Input\nEnhancement for improving the performance of pre-trained models on downstream\ntasks; Selective Input Knowledge Fusion for fusion of heterogeneous local\nknowledge on the server; and Best Global Prototype Selection for finding the\nbest anchor point for each class in the feature space. 
Extensive experiments\ndemonstrate that FedTA not only outperforms existing FCL methods but also\neffectively preserves the relative positions of features.\n","authors":["Hao Yu","Xin Yang","Le Zhang","Hanlin Gu","Tianrui Li","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2412.18355v2.pdf","comment":"This paper is accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2502.20475v2","updated":"2025-03-05T13:22:47Z","published":"2025-02-27T19:23:15Z","title":"Promote, Suppress, Iterate: How Language Models Answer One-to-Many\n Factual Queries","summary":" To answer one-to-many factual queries (e.g., listing cities of a country), a\nlanguage model (LM) must simultaneously recall knowledge and avoid repeating\nprevious answers. How are these two subtasks implemented and integrated\ninternally? Across multiple datasets and models, we identify a\npromote-then-suppress mechanism: the model first recalls all answers, and then\nsuppresses previously generated ones. Specifically, LMs use both the subject\nand previous answer tokens to perform knowledge recall, with attention\npropagating subject information and MLPs promoting the answers. Then, attention\nattends to and suppresses previous answer tokens, while MLPs amplify the\nsuppression signal. Our mechanism is corroborated by extensive experimental\nevidence: in addition to using early decoding and causal tracing, we analyze\nhow components use different tokens by introducing both Token Lens, which\ndecodes aggregated attention updates from specified tokens, and a knockout\nmethod that analyzes changes in MLP outputs after removing attention to\nspecified tokens. Overall, we provide new insights into how LMs' internal\ncomponents interact with different input tokens to support complex factual\nrecall. 
Code is available at\nhttps://github.com/Lorenayannnnn/how-lms-answer-one-to-many-factual-queries.\n","authors":["Tianyi Lorena Yan","Robin Jia"],"pdf_url":"https://arxiv.org/pdf/2502.20475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02393v3","updated":"2025-03-05T13:19:16Z","published":"2025-01-04T22:30:21Z","title":"Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers","summary":" We present an approach to modifying Transformer architectures by integrating\ngraph-aware relational reasoning into the attention mechanism, merging concepts\nfrom graph neural networks and language modeling. Building on the inherent\nconnection between attention and graph theory, we reformulate the Transformer's\nattention mechanism as a graph operation and propose Graph-Aware Isomorphic\nAttention. This method leverages advanced graph modeling strategies, including\nGraph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA),\nto enrich the representation of relational structures. Our approach captures\ncomplex dependencies and generalizes across tasks, as evidenced by a reduced\ngeneralization gap and improved learning performance. Additionally, we expand\nthe concept of graph-aware attention to introduce Sparse GIN-Attention, a\nfine-tuning approach that employs sparse GINs. By interpreting attention\nmatrices as sparse adjacency graphs, this technique enhances the adaptability\nof pre-trained foundational models with minimal computational overhead,\nendowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning\nachieves improved training dynamics and better generalization compared to\nalternative methods like low-rank adaption (LoRA). We discuss latent graph-like\nstructures within traditional attention mechanisms, offering a new lens through\nwhich Transformers can be understood. By evolving Transformers as hierarchical\nGIN models for relational reasoning. 
This perspective suggests profound\nimplications for foundational model development, enabling the design of\narchitectures that dynamically adapt to both local and global dependencies.\nApplications in bioinformatics, materials science, language modeling, and\nbeyond could benefit from this synthesis of relational and sequential data\nmodeling, setting the stage for interpretable and generalizable modeling\nstrategies.\n","authors":["Markus J. Buehler"],"pdf_url":"https://arxiv.org/pdf/2501.02393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03480v1","updated":"2025-03-05T13:16:55Z","published":"2025-03-05T13:16:55Z","title":"SafeVLA: Towards Safety Alignment of Vision-Language-Action Model via\n Safe Reinforcement Learning","summary":" Vision-language-action models (VLAs) have shown great potential as generalist\nrobot policies. However, these models pose urgent safety challenges during\ndeployment, including the risk of physical harm to the environment, the robot\nitself, and humans. How can safety be explicitly incorporated into VLAs? In\nthis work, we propose SafeVLA, a novel algorithm designed to integrate safety\ninto VLAs, ensuring the protection of the environment, robot hardware and\nhumans in real-world settings. SafeVLA effectively balances safety and task\nperformance by employing large-scale constrained learning within simulated\nenvironments. We demonstrate that SafeVLA outperforms the current\nstate-of-the-art method in both safety and task performance, achieving average\nimprovements of 83.58% and 3.85%, respectively, in simulation. By prioritizing\nsafety, our approach eliminates high-risk behaviors and reduces the upper bound\nof unsafe behaviors to 1/35 of that in the current state-of-the-art, thereby\nsignificantly mitigating long-tail risks. Furthermore, the learned safety\nconstraints generalize to diverse, unseen scenarios, including multiple\nout-of-distribution perturbations and tasks. 
Our data, models and newly\nproposed benchmark environment are available at\nhttps://sites.google.com/view/pku-safevla.\n","authors":["Borong Zhang","Yuhao Zhang","Jiaming Ji","Yingshan Lei","Josef Dai","Yuanpei Chen","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2503.03480v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2503.03462v1","updated":"2025-03-05T12:52:14Z","published":"2025-03-05T12:52:14Z","title":"Open-Source Large Language Models as Multilingual Crowdworkers:\n Synthesizing Open-Domain Dialogues in Several Languages With No Examples in\n Targets and No Machine Translation","summary":" The prevailing paradigm in the domain of Open-Domain Dialogue agents\npredominantly focuses on the English language, encompassing both models and\ndatasets. Furthermore, the financial and temporal investments required for\ncrowdsourcing such datasets for finetuning are substantial, particularly when\nmultiple languages are involved. Fortunately, advancements in Large Language\nModels (LLMs) have unveiled a plethora of possibilities across diverse tasks.\nSpecifically, instruction-tuning has enabled LLMs to execute tasks based on\nnatural language instructions, occasionally surpassing the performance of human\ncrowdworkers. Additionally, these models possess the capability to function in\nvarious languages within a single thread. Consequently, to generate new samples\nin different languages, we propose leveraging these capabilities to replicate\nthe data collection process. We introduce a pipeline for generating Open-Domain\nDialogue data in multiple Target Languages using LLMs, with demonstrations\nprovided in a unique Source Language. By eschewing explicit Machine Translation\nin this approach, we enhance the adherence to language-specific nuances. We\napply this methodology to the PersonaChat dataset. 
To enhance the openness of\ngenerated dialogues and mimic real life scenarii, we added the notion of speech\nevents corresponding to the type of conversation the speakers are involved in\nand also that of common ground which represents the premises of a conversation.\n","authors":["Ahmed Njifenjou","Virgile Sucal","Bassam Jabaian","Fabrice Lefèvre"],"pdf_url":"https://arxiv.org/pdf/2503.03462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03459v1","updated":"2025-03-05T12:49:44Z","published":"2025-03-05T12:49:44Z","title":"Unified Mind Model: Reimagining Autonomous Agents in the LLM Era","summary":" Large language models (LLMs) have recently demonstrated remarkable\ncapabilities across domains, tasks, and languages (e.g., ChatGPT and GPT-4),\nreviving the research of general autonomous agents with human-like cognitive\nabilities.Such human-level agents require semantic comprehension and\ninstruction-following capabilities, which exactly fall into the strengths of\nLLMs.Although there have been several initial attempts to build human-level\nagents based on LLMs, the theoretical foundation remains a challenging open\nproblem. In this paper, we propose a novel theoretical cognitive architecture,\nthe Unified Mind Model (UMM), which offers guidance to facilitate the rapid\ncreation of autonomous agents with human-level cognitive abilities.\nSpecifically, our UMM starts with the global workspace theory and further\nleverage LLMs to enable the agent with various cognitive abilities, such as\nmulti-modal perception, planning, reasoning, tool use, learning, memory,\nreflection and motivation. 
Building upon UMM, we then develop an agent-building\nengine, MindOS, which allows users to quickly create domain-/task-specific\nautonomous agents without any programming effort.\n","authors":["Pengbo Hu","Xiang Ying"],"pdf_url":"https://arxiv.org/pdf/2503.03459v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2403.10860v2","updated":"2025-03-05T12:41:05Z","published":"2024-03-16T08:57:00Z","title":"Sim2Real within 5 Minutes: Efficient Domain Transfer with Stylized\n Gaussian Splatting for Endoscopic Images","summary":" Robot assisted endoluminal intervention is an emerging technique for both\nbenign and malignant luminal lesions. With vision-based navigation, when\ncombined with pre-operative imaging data as priors, it is possible to recover\nposition and pose of the endoscope without the need of additional sensors. In\npractice, however, aligning pre-operative and intra-operative domains is\ncomplicated by significant texture differences. Although methods such as style\ntransfer can be used to address this issue, they require large datasets from\nboth source and target domains with prolonged training times. This paper\nproposes an efficient domain transfer method based on stylized Gaussian\nsplatting, only requiring a few of real images (10 images) with very fast\ntraining time. Specifically, the transfer process includes two phases. In the\nfirst phase, the 3D models reconstructed from CT scans are represented as\ndifferential Gaussian point clouds. In the second phase, only color appearance\nrelated parameters are optimized to transfer the style and preserve the visual\ncontent. A novel structure consistency loss is applied to latent features and\ndepth levels to enhance the stability of the transferred images. 
Detailed\nvalidation was performed to demonstrate the performance advantages of the\nproposed method compared to that of the current state-of-the-art, highlighting\nthe potential for intra-operative surgical navigation.\n","authors":["Junyang Wu","Yun Gu","Guang-Zhong Yang"],"pdf_url":"https://arxiv.org/pdf/2403.10860v2.pdf","comment":"Accepted by ICRA 2025"},{"id":"http://arxiv.org/abs/2502.05503v3","updated":"2025-03-05T12:27:57Z","published":"2025-02-08T09:31:26Z","title":"A Physical Coherence Benchmark for Evaluating Video Generation Models\n via Optical Flow-guided Frame Prediction","summary":" Recent advances in video generation models demonstrate their potential as\nworld simulators, but they often struggle with videos deviating from physical\nlaws, a key concern overlooked by most text-to-video benchmarks. We introduce a\nbenchmark designed specifically to assess the Physical Coherence of generated\nvideos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of\nphysical principles, capturing key physical laws observable in video content.\nWe evaluated four state-of-the-art (SoTA) T2V models on PhyCoBench and\nconducted manual assessments. Additionally, we propose an automated evaluation\nmodel: PhyCoPredictor, a diffusion model that generates optical flow and video\nframes in a cascade manner. Through a consistency evaluation comparing\nautomated and manual sorting, the experimental results show that PhyCoPredictor\ncurrently aligns most closely with human evaluation. Therefore, it can\neffectively evaluate the physical coherence of videos, providing insights for\nfuture model optimization. 
Our benchmark, including physical coherence prompts,\nthe automatic evaluation tool PhyCoPredictor, and the generated video dataset,\nhas been released on GitHub at https://github.com/Jeckinchen/PhyCoBench.\n","authors":["Yongfan Chen","Xiuwen Zhu","Tianyu Li"],"pdf_url":"https://arxiv.org/pdf/2502.05503v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03444v1","updated":"2025-03-05T12:24:20Z","published":"2025-03-05T12:24:20Z","title":"Taxation Perspectives from Large Language Models: A Case Study on\n Additional Tax Penalties","summary":" How capable are large language models (LLMs) in the domain of taxation?\nAlthough numerous studies have explored the legal domain in general, research\ndedicated to taxation remain scarce. Moreover, the datasets used in these\nstudies are either simplified, failing to reflect the real-world complexities,\nor unavailable as open source. To address this gap, we introduce PLAT, a new\nbenchmark designed to assess the ability of LLMs to predict the legitimacy of\nadditional tax penalties. PLAT is constructed to evaluate LLMs' understanding\nof tax law, particularly in cases where resolving the issue requires more than\njust applying related statutes. Our experiments with six LLMs reveal that their\nbaseline capabilities are limited, especially when dealing with conflicting\nissues that demand a comprehensive understanding. However, we found that\nenabling retrieval, self-reasoning, and discussion among multiple agents with\nspecific role assignments, this limitation can be mitigated.\n","authors":["Eunkyung Choi","Young Jin Suh","Hun Park","Wonseok Hwang"],"pdf_url":"https://arxiv.org/pdf/2503.03444v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2503.03443v1","updated":"2025-03-05T12:24:12Z","published":"2025-03-05T12:24:12Z","title":"Conceptualizing Uncertainty","summary":" Uncertainty in machine learning refers to the degree of confidence or lack\nthereof in a model's predictions. 
While uncertainty quantification methods\nexist, explanations of uncertainty, especially in high-dimensional settings,\nremain an open challenge. Existing work focuses on feature attribution\napproaches which are restricted to local explanations. Understanding\nuncertainty, its origins, and characteristics on a global scale is crucial for\nenhancing interpretability and trust in a model's predictions. In this work, we\npropose to explain the uncertainty in high-dimensional data classification\nsettings by means of concept activation vectors which give rise to local and\nglobal explanations of uncertainty. We demonstrate the utility of the generated\nexplanations by leveraging them to refine and improve our model.\n","authors":["Isaac Roberts","Alexander Schulz","Sarah Schroeder","Fabian Hinder","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2503.03443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10069v2","updated":"2025-03-05T12:22:23Z","published":"2025-01-17T09:42:48Z","title":"A Survey on LLM Test-Time Compute via Search: Tasks, LLM Profiling,\n Search Algorithms, and Relevant Frameworks","summary":" LLM test-time compute (or LLM inference) via search has emerged as a\npromising research area with rapid developments. However, current frameworks\noften adopt distinct perspectives on three key aspects (task definition, LLM\nprofiling, and search procedures), making direct comparisons challenging.\nMoreover, the search algorithms employed often diverge from standard\nimplementations, and their specific characteristics are not thoroughly\nspecified. In this survey, we provide a comprehensive technical review that\nunifies task definitions and provides modular definitions of LLM profiling and\nsearch procedures. The definitions enable precise comparisons of various LLM\ninference frameworks while highlighting their departures from conventional\nsearch algorithms. We also discuss the applicability, performance, and\nefficiency of these methods. 
We have updated our content to include the latest\npapers, and the differences between versions are highlighted in the appendix.\nFor further details and ongoing updates, please refer to our GitHub repository:\nhttps://github.com/xinzhel/LLM-Agent-Survey/blob/main/search.md\n","authors":["Xinzhe Li"],"pdf_url":"https://arxiv.org/pdf/2501.10069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03434v1","updated":"2025-03-05T12:10:14Z","published":"2025-03-05T12:10:14Z","title":"RASD: Retrieval-Augmented Speculative Decoding","summary":" Speculative decoding accelerates inference in large language models (LLMs) by\ngenerating draft tokens for target model verification. Current approaches for\nobtaining draft tokens rely on lightweight draft models or additional model\nstructures to generate draft tokens and retrieve context from databases. Due to\nthe draft model's small size and limited training data, model-based speculative\ndecoding frequently becomes less effective in out-of-domain scenarios.\nAdditionally, the time cost of the drafting phase results in a low upper limit\non acceptance length during the verification step, limiting overall efficiency.\nThis paper proposes RASD (Retrieval-Augmented Speculative Decoding), which\nadopts retrieval methods to enhance model-based speculative decoding. We\nintroduce tree pruning and tree fusion to achieve this. Specifically, we\ndevelop a pruning method based on the draft model's probability distribution to\nconstruct the optimal retrieval tree. Second, we employ the longest prefix\nmatching algorithm to merge the tree generated by the draft model with the\nretrieval tree, resulting in a unified tree for verification. Experimental\nresults demonstrate that RASD achieves state-of-the-art inference acceleration\nacross tasks such as DocQA, Summary, Code, and In-Domain QA. 
Moreover, RASD\nexhibits strong scalability, seamlessly integrating with various speculative\ndecoding approaches, including both generation-based and retrieval-based\nmethods.\n","authors":["Guofeng Quan","Wenfeng Feng","Chuzhan Hao","Guochao Jiang","Yuewei Zhang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03428v1","updated":"2025-03-05T12:01:22Z","published":"2025-03-05T12:01:22Z","title":"Privacy is All You Need: Revolutionizing Wearable Health Data with\n Advanced PETs","summary":" In a world where data is the new currency, wearable health devices offer\nunprecedented insights into daily life, continuously monitoring vital signs and\nmetrics. However, this convenience raises privacy concerns, as these devices\ncollect sensitive data that can be misused or breached. Traditional measures\noften fail due to real-time data processing needs and limited device power.\nUsers also lack awareness and control over data sharing and usage. We propose a\nPrivacy-Enhancing Technology (PET) framework for wearable devices, integrating\nfederated learning, lightweight cryptographic methods, and selectively deployed\nblockchain technology. The blockchain acts as a secure ledger triggered only\nupon data transfer requests, granting users real-time notifications and\ncontrol. By dismantling data monopolies, this approach returns data sovereignty\nto individuals. Through real-world applications like secure medical data\nsharing, privacy-preserving fitness tracking, and continuous health monitoring,\nour framework reduces privacy risks by up to 70 percent while preserving data\nutility and performance. This innovation sets a new benchmark for wearable\nprivacy and can scale to broader IoT ecosystems, including smart homes and\nindustry. 
As data continues to shape our digital landscape, our research\nunderscores the critical need to maintain privacy and user control at the\nforefront of technological progress.\n","authors":["Karthik Barma","Seshu Babu Barma"],"pdf_url":"https://arxiv.org/pdf/2503.03428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.00735v3","updated":"2025-03-05T11:50:24Z","published":"2025-03-02T05:16:43Z","title":"LADDER: Self-Improving LLMs Through Recursive Problem Decomposition","summary":" We introduce LADDER (Learning through Autonomous Difficulty-Driven Example\nRecursion), a framework which enables Large Language Models to autonomously\nimprove their problem-solving capabilities through self-guided learning by\nrecursively generating and solving progressively simpler variants of complex\nproblems. Unlike prior approaches that require curated datasets or human\nfeedback, LADDER leverages a model's own capabilities to generate easier\nquestion variants. We demonstrate LADDER's effectiveness in the subject of\nmathematical integration, improving Llama 3.2 3B's accuracy from 1% to 82% on\nundergraduate-level problems and enabling Qwen2.5 7B Deepseek-R1 Distilled to\nachieve 73% on the MIT Integration Bee qualifying examination. We also\nintroduce TTRL (Test-Time Reinforcement Learning), where we perform\nreinforcement learning on variants of test problems at inference time. TTRL\nenables Qwen2.5 7B Deepseek-R1 Distilled to achieve a state-of-the-art score of\n90% on the MIT Integration Bee qualifying examination, surpassing OpenAI o1's\nperformance. 
These results show how self-directed strategic learning can\nachieve significant capability improvements without relying on architectural\nscaling or human supervision.\n","authors":["Toby Simonds","Akira Yoshiyama"],"pdf_url":"https://arxiv.org/pdf/2503.00735v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18377v3","updated":"2025-03-05T11:49:36Z","published":"2024-12-24T12:03:36Z","title":"ChaI-TeA: A Benchmark for Evaluating Autocompletion of Interactions with\n LLM-based Chatbots","summary":" The rise of LLMs has deflected a growing portion of human-computer\ninteractions towards LLM-based chatbots. The remarkable abilities of these\nmodels allow users to interact using long, diverse natural language text\ncovering a wide range of topics and styles. Phrasing these messages is a time\nand effort consuming task, calling for an autocomplete solution to assist\nusers. We introduce the task of chatbot interaction autocomplete. We present\nChaI-TeA: CHat InTEraction Autocomplete; An autcomplete evaluation framework\nfor LLM-based chatbot interactions. The framework includes a formal definition\nof the task, coupled with suitable datasets and metrics. We use the framework\nto evaluate After formally defining the task along with suitable datasets and\nmetrics, we test 9 models on the defined auto completion task, finding that\nwhile current off-the-shelf models perform fairly, there is still much room for\nimprovement, mainly in ranking of the generated suggestions. We provide\ninsights for practitioners working on this task and open new research\ndirections for researchers in the field. 
We release our framework to serve as a\nfoundation for future research.\n","authors":["Shani Goren","Oren Kalinsky","Tomer Stav","Yuri Rapoport","Yaron Fairstein","Ram Yazdi","Nachshon Cohen","Alexander Libov","Guy Kushilevitz"],"pdf_url":"https://arxiv.org/pdf/2412.18377v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03418v1","updated":"2025-03-05T11:47:41Z","published":"2025-03-05T11:47:41Z","title":"Simplicial SMOTE: Oversampling Solution to the Imbalanced Learning\n Problem","summary":" SMOTE (Synthetic Minority Oversampling Technique) is the established\ngeometric approach to random oversampling to balance classes in the imbalanced\nlearning problem, followed by many extensions. Its idea is to introduce\nsynthetic data points of the minor class, with each new point being the convex\ncombination of an existing data point and one of its k-nearest neighbors. In\nthis paper, by viewing SMOTE as sampling from the edges of a geometric\nneighborhood graph and borrowing tools from the topological data analysis, we\npropose a novel technique, Simplicial SMOTE, that samples from the simplices of\na geometric neighborhood simplicial complex. A new synthetic point is defined\nby the barycentric coordinates w.r.t. a simplex spanned by an arbitrary number\nof data points being sufficiently close rather than a pair. Such a replacement\nof the geometric data model results in better coverage of the underlying data\ndistribution compared to existing geometric sampling methods and allows the\ngeneration of synthetic points of the minority class closer to the majority\nclass on the decision boundary. We experimentally demonstrate that our\nSimplicial SMOTE outperforms several popular geometric sampling methods,\nincluding the original SMOTE. Moreover, we show that simplicial sampling can be\neasily integrated into existing SMOTE extensions. 
We generalize and evaluate\nsimplicial extensions of the classic Borderline SMOTE, Safe-level SMOTE, and\nADASYN algorithms, all of which outperform their graph-based counterparts.\n","authors":["Oleg Kachan","Andrey Savchenko","Gleb Gusev"],"pdf_url":"https://arxiv.org/pdf/2503.03418v1.pdf","comment":"Accepted at KDD 2025 (research track)"},{"id":"http://arxiv.org/abs/2503.03417v1","updated":"2025-03-05T11:47:32Z","published":"2025-03-05T11:47:32Z","title":"When Claims Evolve: Evaluating and Enhancing the Robustness of Embedding\n Models Against Misinformation Edits","summary":" Online misinformation remains a critical challenge, and fact-checkers\nincreasingly rely on embedding-based methods to retrieve relevant fact-checks.\nYet, when debunked claims reappear in edited forms, the performance of these\nmethods is unclear. In this work, we introduce a taxonomy of six common\nreal-world misinformation edits and propose a perturbation framework that\ngenerates valid, natural claim variations. Our multi-stage retrieval evaluation\nreveals that standard embedding models struggle with user-introduced edits,\nwhile LLM-distilled embeddings offer improved robustness at a higher\ncomputational cost. Although a strong reranker helps mitigate some issues, it\ncannot fully compensate for first-stage retrieval gaps. Addressing these\nretrieval gaps, our train- and inference-time mitigation approaches enhance\nin-domain robustness by up to 17 percentage points and boost out-of-domain\ngeneralization by 10 percentage points over baseline models. 
Overall, our\nfindings provide practical improvements to claim-matching systems, enabling\nmore reliable fact-checking of evolving misinformation.\n","authors":["Jabez Magomere","Emanuele La Malfa","Manuel Tonneau","Ashkan Kazemi","Scott Hale"],"pdf_url":"https://arxiv.org/pdf/2503.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03410v1","updated":"2025-03-05T11:39:15Z","published":"2025-03-05T11:39:15Z","title":"Augmentation-Based Deep Learning for Identification of Circulating Tumor\n Cells","summary":" Circulating tumor cells (CTCs) are crucial biomarkers in liquid biopsy,\noffering a noninvasive tool for cancer patient management. However, their\nidentification remains particularly challenging due to their limited number and\nheterogeneity. Labeling samples for contrast limits the generalization of\nfluorescence-based methods across different hospital datasets. Analyzing\nsingle-cell images enables detailed assessment of cell morphology, subcellular\nstructures, and phenotypic variations, often hidden in clustered images.\nDeveloping a method based on bright-field single-cell analysis could overcome\nthese limitations. CTCs can be isolated using an unbiased workflow combining\nParsortix technology, which selects cells based on size and deformability, with\nDEPArray technology, enabling precise visualization and selection of single\ncells. Traditionally, DEPArray-acquired digital images are manually analyzed,\nmaking the process time-consuming and prone to variability. In this study, we\npresent a Deep Learning-based classification pipeline designed to distinguish\nCTCs from leukocytes in blood samples, aimed to enhance diagnostic accuracy and\noptimize clinical workflows. Our approach employs images from the bright-field\nchannel acquired through DEPArray technology leveraging a ResNet-based CNN. 
To\nimprove model generalization, we applied three types of data augmentation\ntechniques and incorporated fluorescence (DAPI) channel images into the\ntraining phase, allowing the network to learn additional CTC-specific features.\nNotably, only bright-field images have been used for testing, ensuring the\nmodel's ability to identify CTCs without relying on fluorescence markers. The\nproposed model achieved an F1-score of 0.798, demonstrating its capability to\ndistinguish CTCs from leukocytes. These findings highlight the potential of DL\nin refining CTC analysis and advancing liquid biopsy applications.\n","authors":["Martina Russo","Giulia Bertolini","Vera Cappelletti","Cinzia De Marco","Serena Di Cosimo","Petra Paiè","Nadia Brancati"],"pdf_url":"https://arxiv.org/pdf/2503.03410v1.pdf","comment":"20 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2412.14566v2","updated":"2025-03-05T11:38:00Z","published":"2024-12-19T06:35:54Z","title":"AIArena: A Blockchain-Based Decentralized AI Training Platform","summary":" The rapid advancement of AI has underscored critical challenges in its\ndevelopment and implementation, largely due to centralized control by a few\nmajor corporations. This concentration of power intensifies biases within AI\nmodels, resulting from inadequate governance and oversight mechanisms.\nAdditionally, it limits public involvement and heightens concerns about the\nintegrity of model generation. Such monopolistic control over data and AI\noutputs threatens both innovation and fair data usage, as users inadvertently\ncontribute data that primarily benefits these corporations. In this work, we\npropose AIArena, a blockchain-based decentralized AI training platform designed\nto democratize AI development and alignment through on-chain incentive\nmechanisms. AIArena fosters an open and collaborative environment where\nparticipants can contribute models and computing resources. 
Its on-chain\nconsensus mechanism ensures fair rewards for participants based on their\ncontributions. We instantiate and implement AIArena on the public Base\nblockchain Sepolia testnet, and the evaluation results demonstrate the\nfeasibility of AIArena in real-world applications.\n","authors":["Zhipeng Wang","Rui Sun","Elizabeth Lui","Tuo Zhou","Yizhe Wen","Jiahao Sun"],"pdf_url":"https://arxiv.org/pdf/2412.14566v2.pdf","comment":"Camera ready version. Accepted by the ACM Web Conference (WWW), 2025"},{"id":"http://arxiv.org/abs/2503.03395v1","updated":"2025-03-05T11:19:17Z","published":"2025-03-05T11:19:17Z","title":"AI-Driven Multi-Stage Computer Vision System for Defect Detection in\n Laser-Engraved Industrial Nameplates","summary":" Automated defect detection in industrial manufacturing is essential for\nmaintaining product quality and minimizing production errors. In air disc brake\nmanufacturing, ensuring the precision of laser-engraved nameplates is crucial\nfor accurate product identification and quality control. Engraving errors, such\nas misprints or missing characters, can compromise both aesthetics and\nfunctionality, leading to material waste and production delays. This paper\npresents a proof of concept for an AI-driven computer vision system that\ninspects and verifies laser-engraved nameplates, detecting defects in logos and\nalphanumeric strings. The system integrates object detection using YOLOv7,\noptical character recognition (OCR) with Tesseract, and anomaly detection\nthrough a residual variational autoencoder (ResVAE) along with other computer\nvision methods to enable comprehensive inspections at multiple stages.\nExperimental results demonstrate the system's effectiveness, achieving 91.33%\naccuracy and 100% recall, ensuring that defective nameplates are consistently\ndetected and addressed. 
This solution highlights the potential of AI-driven\nvisual inspection to enhance quality control, reduce manual inspection efforts,\nand improve overall manufacturing efficiency.\n","authors":["Adhish Anitha Vilasan","Stephan Jäger","Noah Klarmann"],"pdf_url":"https://arxiv.org/pdf/2503.03395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01243v3","updated":"2025-03-05T11:17:18Z","published":"2024-12-02T08:05:26Z","title":"Schedule On the Fly: Diffusion Time Prediction for Faster and Better\n Image Generation","summary":" Diffusion and flow matching models have achieved remarkable success in\ntext-to-image generation. However, these models typically rely on the\npredetermined denoising schedules for all prompts. The multi-step reverse\ndiffusion process can be regarded as a kind of chain-of-thought for generating\nhigh-quality images step by step. Therefore, diffusion models should reason for\neach instance to adaptively determine the optimal noise schedule, achieving\nhigh generation quality with sampling efficiency. In this paper, we introduce\nthe Time Prediction Diffusion Model (TPDM) for this. TPDM employs a\nplug-and-play Time Prediction Module (TPM) that predicts the next noise level\nbased on current latent features at each denoising step. We train the TPM using\nreinforcement learning to maximize a reward that encourages high final image\nquality while penalizing excessive denoising steps. With such an adaptive\nscheduler, TPDM not only generates high-quality images that are aligned closely\nwith human preferences but also adjusts diffusion time and the number of\ndenoising steps on the fly, enhancing both performance and efficiency. 
With\nStable Diffusion 3 Medium architecture, TPDM achieves an aesthetic score of\n5.44 and a human preference score (HPS) of 29.59, while using around 50% fewer\ndenoising steps to achieve better performance.\n","authors":["Zilyu Ye","Zhiyang Chen","Tiancheng Li","Zemin Huang","Weijian Luo","Guo-Jun Qi"],"pdf_url":"https://arxiv.org/pdf/2412.01243v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14507v2","updated":"2025-03-05T11:15:58Z","published":"2024-08-24T16:54:08Z","title":"Prompt-Matcher: Leveraging Large Models to Reduce Uncertainty in Schema\n Matching Results","summary":" Schema matching is the process of identifying correspondences between the\nelements of two given schemata, essential for database management systems, data\nintegration, and data warehousing. For datasets across different scenarios, the\noptimal schema matching algorithm is different. For single algorithm,\nhyperparameter tuning also cases multiple results. All results assigned equal\nprobabilities are stored in probabilistic databases to facilitate uncertainty\nmanagement. The substantial degree of uncertainty diminishes the efficiency and\nreliability of data processing, thereby precluding the provision of more\naccurate information for decision-makers. To address this problem, we introduce\na new approach based on fine-grained correspondence verification with specific\nprompt of Large Language Model.\n Our approach is an iterative loop that consists of three main components: (1)\nthe correspondence selection algorithm, (2) correspondence verification, and\n(3) the update of probability distribution. The core idea is that\ncorrespondences intersect across multiple results, thereby linking the\nverification of correspondences to the reduction of uncertainty in candidate\nresults.\n The task of selecting an optimal correspondence set to maximize the\nanticipated uncertainty reduction within a fixed budgetary framework is\nestablished as an NP-hard problem. 
We propose a novel $(1-1/e)$-approximation\nalgorithm that significantly outperforms brute algorithm in terms of\ncomputational efficiency. To enhance correspondence verification, we have\ndeveloped two prompt templates that enable GPT-4 to achieve state-of-the-art\nperformance across two established benchmark datasets. Our comprehensive\nexperimental evaluation demonstrates the superior effectiveness and robustness\nof the proposed approach.\n","authors":["Longyu Feng","Huahang Li","Chen Jason Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14153v2","updated":"2025-03-05T11:15:39Z","published":"2024-08-26T09:55:34Z","title":"Explaining Vision-Language Similarities in Dual Encoders with\n Feature-Pair Attributions","summary":" Dual encoder architectures like CLIP models map two types of inputs into a\nshared embedding space and predict similarities between them. Despite their\nsuccess, it is, however, not understood how these models compare their two\ninputs. Common first-order feature-attribution methods can only provide limited\ninsights into dual-encoders since their predictions depend on\nfeature-interactions rather than on individual features. In this paper, we\nfirst derive a second-order method enabling the attribution of predictions by\nany differentiable dual encoder onto feature-interactions between its inputs.\nSecond, we apply our method to CLIP models and show that they learn\nfine-grained correspondences between parts of captions and regions in images.\nThey match objects across input modes also account for mismatches. This\nvisual-linguistic grounding ability, however, varies heavily between object\nclasses and exhibits pronounced out-of-domain effects. 
We can identify\nindividual errors as well as systematic failure categories including object\ncoverage, unusual scenes and correlated contexts.\n","authors":["Lucas Möller","Pascal Tilli","Ngoc Thang Vu","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2408.14153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03391v1","updated":"2025-03-05T11:12:40Z","published":"2025-03-05T11:12:40Z","title":"Multi-Agent DRL for Queue-Aware Task Offloading in Hierarchical\n MEC-Enabled Air-Ground Networks","summary":" Mobile edge computing (MEC)-enabled air-ground networks are a key component\nof 6G, employing aerial base stations (ABSs) such as unmanned aerial vehicles\n(UAVs) and high-altitude platform stations (HAPS) to provide dynamic services\nto ground IoT devices (IoTDs). These IoTDs support real-time applications\n(e.g., multimedia and Metaverse services) that demand high computational\nresources and strict quality of service (QoS) guarantees in terms of latency\nand task queue management. Given their limited energy and processing\ncapabilities, IoTDs rely on UAVs and HAPS to offload tasks for distributed\nprocessing, forming a multi-tier MEC system. This paper tackles the overall\nenergy minimization problem in MEC-enabled air-ground integrated networks\n(MAGIN) by jointly optimizing UAV trajectories, computing resource allocation,\nand queue-aware task offloading decisions. The optimization is challenging due\nto the nonconvex, nonlinear nature of this hierarchical system, which renders\ntraditional methods ineffective. We reformulate the problem as a multi-agent\nMarkov decision process (MDP) with continuous action spaces and heterogeneous\nagents, and propose a novel variant of multi-agent proximal policy optimization\nwith a Beta distribution (MAPPO-BD) to solve it. 
Extensive simulations show\nthat MAPPO-BD outperforms baseline schemes, achieving superior energy savings\nand efficient resource management in MAGIN while meeting queue delay and edge\ncomputing constraints.\n","authors":["Muhammet Hevesli","Abegaz Mohammed Seid","Aiman Erbad","Mohamed Abdallah"],"pdf_url":"https://arxiv.org/pdf/2503.03391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.09453v2","updated":"2025-03-05T11:04:58Z","published":"2022-06-19T17:13:58Z","title":"Bounding Evidence and Estimating Log-Likelihood in VAE","summary":" Many crucial problems in deep learning and statistical inference are caused\nby a variational gap, i.e., a difference between model evidence\n(log-likelihood) and evidence lower bound (ELBO). In particular, in a classical\nVAE setting that involves training via an ELBO cost function, it is difficult\nto provide a robust comparison of the effects of training between models, since\nwe do not know a log-likelihood of data (but only its lower bound). In this\npaper, to deal with this problem, we introduce a general and effective upper\nbound, which allows us to efficiently approximate the evidence of data. 
We\nprovide extensive theoretical and experimental studies of our approach,\nincluding its comparison to the other state-of-the-art upper bounds, as well as\nits application as a tool for the evaluation of models that were trained on\nvarious lower bounds.\n","authors":["Łukasz Struski","Marcin Mazur","Paweł Batorski","Przemysław Spurek","Jacek Tabor"],"pdf_url":"https://arxiv.org/pdf/2206.09453v2.pdf","comment":"Paper accepted for AISTATS 2023"},{"id":"http://arxiv.org/abs/2502.15425v4","updated":"2025-03-05T10:48:42Z","published":"2025-02-21T12:52:16Z","title":"TAG: A Decentralized Framework for Multi-Agent Hierarchical\n Reinforcement Learning","summary":" Hierarchical organization is fundamental to biological systems and human\nsocieties, yet artificial intelligence systems often rely on monolithic\narchitectures that limit adaptability and scalability. Current hierarchical\nreinforcement learning (HRL) approaches typically restrict hierarchies to two\nlevels or require centralized training, which limits their practical\napplicability. We introduce TAME Agent Framework (TAG), a framework for\nconstructing fully decentralized hierarchical multi-agent systems. TAG enables\nhierarchies of arbitrary depth through a novel LevelEnv concept, which\nabstracts each hierarchy level as the environment for the agents above it. This\napproach standardizes information flow between levels while preserving loose\ncoupling, allowing for seamless integration of diverse agent types. We\ndemonstrate the effectiveness of TAG by implementing hierarchical architectures\nthat combine different RL agents across multiple levels, achieving improved\nperformance over classical multi-agent RL baselines on standard benchmarks. 
Our\nresults show that decentralized hierarchical organization enhances both\nlearning speed and final performance, positioning TAG as a promising direction\nfor scalable multi-agent systems.\n","authors":["Giuseppe Paolo","Abdelhakim Benechehab","Hamza Cherkaoui","Albert Thomas","Balázs Kégl"],"pdf_url":"https://arxiv.org/pdf/2502.15425v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03565v2","updated":"2025-03-05T10:47:17Z","published":"2024-10-04T16:15:31Z","title":"Exploration Implies Data Augmentation: Reachability and Generalisation\n in Contextual MDPs","summary":" In the zero-shot policy transfer (ZSPT) setting for contextual Markov\ndecision processes (MDP), agents train on a fixed set of contexts and must\ngeneralise to new ones. Recent work has argued and demonstrated that increased\nexploration can improve this generalisation, by training on more states in the\ntraining contexts. In this paper, we demonstrate that training on more states\ncan indeed improve generalisation, but can come at a cost of reducing the\naccuracy of the learned value function which should not benefit generalisation.\nWe introduce reachability in the ZSPT setting to define which states/contexts\nrequire generalisation and explain why exploration can improve it. We\nhypothesise and demonstrate that using exploration to increase the agent's\ncoverage while also increasing the accuracy improves generalisation even more.\nInspired by this, we propose a method Explore-Go that implements an exploration\nphase at the beginning of each episode, which can be combined with existing on-\nand off-policy RL algorithms and significantly improves generalisation even in\npartially observable MDPs. We demonstrate the effectiveness of Explore-Go when\ncombined with several popular algorithms and show an increase in generalisation\nperformance across several environments. 
With this, we hope to provide\npractitioners with a simple modification that can improve the generalisation of\ntheir agents.\n","authors":["Max Weltevrede","Caroline Horsch","Matthijs T. J. Spaan","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2410.03565v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.08069"},{"id":"http://arxiv.org/abs/2503.03361v1","updated":"2025-03-05T10:40:19Z","published":"2025-03-05T10:40:19Z","title":"From Infants to AI: Incorporating Infant-like Learning in Models Boosts\n Efficiency and Generalization in Learning Social Prediction Tasks","summary":" Early in development, infants learn a range of useful concepts, which can be\nchallenging from a computational standpoint. This early learning comes together\nwith an initial understanding of aspects of the meaning of concepts, e.g.,\ntheir implications, causality, and using them to predict likely future events.\nAll this is accomplished in many cases with little or no supervision, and from\nrelatively few examples, compared with current network models. In learning\nabout objects and human-object interactions, early acquired and possibly innate\nconcepts are often used in the process of learning additional, more complex\nconcepts. In the current work, we model how early-acquired concepts are used in\nthe learning of subsequent concepts, and compare the results with standard deep\nnetwork modeling. We focused in particular on the use of the concepts of\nanimacy and goal attribution in learning to predict future events. We show that\nthe use of early concepts in the learning of new concepts leads to better\nlearning (higher accuracy) and more efficient learning (requiring less data).\nWe further show that this integration of early and new concepts shapes the\nrepresentation of the concepts acquired by the model. 
The results show that\nwhen the concepts were learned in a human-like manner, the emerging\nrepresentation was more useful, as measured in terms of generalization to novel\ndata and tasks. On a more general level, the results suggest that there are\nlikely to be basic differences in the conceptual structures acquired by current\nnetwork models compared to human learning.\n","authors":["Shify Treger","Shimon Ullman"],"pdf_url":"https://arxiv.org/pdf/2503.03361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03360v1","updated":"2025-03-05T10:40:09Z","published":"2025-03-05T10:40:09Z","title":"Transformers for molecular property prediction: Domain adaptation\n efficiently improves performance","summary":" Most of the current transformer-based chemical language models are\npre-trained on millions to billions of molecules. However, the improvement from\nsuch scaling in dataset size is not confidently linked to improved molecular\nproperty prediction. The aim of this study is to investigate and overcome some\nof the limitations of transformer models in predicting molecular properties.\nSpecifically, we examine the impact of pre-training dataset size and diversity\non the performance of transformer models and investigate the use of domain\nadaptation as a technique for improving model performance. First, our findings\nindicate that increasing pretraining dataset size beyond 400K molecules from\nthe GuacaMol dataset does not result in a significant improvement on four ADME\nendpoints, namely, solubility, permeability, microsomal stability, and plasma\nprotein binding. Second, our results demonstrate that using domain adaptation\nby further training the transformer model on a small set of domain-relevant\nmolecules, i.e., a few hundred to a few thousand, using multi-task regression\nof physicochemical properties was sufficient to significantly improve\nperformance for three out of the four investigated ADME endpoints (P-value <\n0.001). 
Finally, we observe that a model pre-trained on 400K molecules and\ndomain adopted on a few hundred/thousand molecules performs similarly (P-value\n> 0.05) to more complicated transformer models like MolBERT(pre-trained on 1.3M\nmolecules) and MolFormer (pre-trained on 100M molecules). A comparison to a\nrandom forest model trained on basic physicochemical properties showed similar\nperformance to the examined transformer models. We believe that current\ntransformer models can be improved through further systematic analysis of\npre-training and downstream data, pre-training objectives, and scaling laws,\nultimately leading to better and more helpful models.\n","authors":["Afnan Sultan","Max Rausch-Dupont","Shahrukh Khan","Olga Kalinina","Andrea Volkamer","Dietrich Klakow"],"pdf_url":"https://arxiv.org/pdf/2503.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03350v1","updated":"2025-03-05T10:22:49Z","published":"2025-03-05T10:22:49Z","title":"Leveraging Large Language Models to Develop Heuristics for Emerging\n Optimization Problems","summary":" Combinatorial optimization problems often rely on heuristic algorithms to\ngenerate efficient solutions. However, the manual design of heuristics is\nresource-intensive and constrained by the designer's expertise. Recent advances\nin artificial intelligence, particularly large language models (LLMs), have\ndemonstrated the potential to automate heuristic generation through\nevolutionary frameworks. Recent works focus only on well-known combinatorial\noptimization problems like the traveling salesman problem and online bin\npacking problem when designing constructive heuristics. This study investigates\nwhether LLMs can effectively generate heuristics for niche, not yet broadly\nresearched optimization problems, using the unit-load pre-marshalling problem\nas an example case. 
We propose the Contextual Evolution of Heuristics (CEoH)\nframework, an extension of the Evolution of Heuristics (EoH) framework, which\nincorporates problem-specific descriptions to enhance in-context learning\nduring heuristic generation. Through computational experiments, we evaluate\nCEoH and EoH and compare the results. Results indicate that CEoH enables\nsmaller LLMs to generate high-quality heuristics more consistently and even\noutperform larger models. Larger models demonstrate robust performance with or\nwithout contextualized prompts. The generated heuristics exhibit scalability to\ndiverse instance configurations.\n","authors":["Thomas Bömer","Nico Koltermann","Max Disselnmeyer","Laura Dörr","Anne Meyer"],"pdf_url":"https://arxiv.org/pdf/2503.03350v1.pdf","comment":"Under review LION19: The 19th Learning and Intelligent OptimizatioN\n Conference"},{"id":"http://arxiv.org/abs/2412.03076v2","updated":"2025-03-05T10:19:05Z","published":"2024-12-04T06:53:59Z","title":"Coordinated Multi-Armed Bandits for Improved Spatial Reuse in Wi-Fi","summary":" Multi-Access Point Coordination (MAPC) and Artificial Intelligence and\nMachine Learning (AI/ML) are expected to be key features in future Wi-Fi, such\nas the forthcoming IEEE 802.11bn (Wi-Fi~8) and beyond. In this paper, we\nexplore a coordinated solution based on online learning to drive the\noptimization of Spatial Reuse (SR), a method that allows multiple devices to\nperform simultaneous transmissions by controlling interference through Packet\nDetect (PD) adjustment and transmit power control. In particular, we focus on a\nMulti-Agent Multi-Armed Bandit (MA-MAB) setting, where multiple decision-making\nagents concurrently configure SR parameters from coexisting networks by\nleveraging the MAPC framework, and study various algorithms and reward-sharing\nmechanisms. 
We evaluate different MA-MAB implementations using Komondor, a\nwell-adopted Wi-Fi simulator, and demonstrate that AI-native SR enabled by\ncoordinated MABs can improve the network performance over current Wi-Fi\noperation: mean throughput increases by 15%, fairness is improved by increasing\nthe minimum throughput across the network by 210%, while the maximum access\ndelay is kept below 3 ms.\n","authors":["Francesc Wilhelmi","Boris Bellalta","Szymon Szott","Katarzyna Kosek-Szott","Sergio Barrachina-Muñoz"],"pdf_url":"https://arxiv.org/pdf/2412.03076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03338v1","updated":"2025-03-05T10:12:22Z","published":"2025-03-05T10:12:22Z","title":"Navigating Intelligence: A Survey of Google OR-Tools and Machine\n Learning for Global Path Planning in Autonomous Vehicles","summary":" We offer a new in-depth investigation of global path planning (GPP) for\nunmanned ground vehicles, an autonomous mining sampling robot named ROMIE. GPP\nis essential for ROMIE's optimal performance, which is translated into solving\nthe traveling salesman problem, a complex graph theory challenge that is\ncrucial for determining the most effective route to cover all sampling\nlocations in a mining field. This problem is central to enhancing ROMIE's\noperational efficiency and competitiveness against human labor by optimizing\ncost and time. The primary aim of this research is to advance GPP by\ndeveloping, evaluating, and improving a cost-efficient software and web\napplication. We delve into an extensive comparison and analysis of Google\noperations research (OR)-Tools optimization algorithms. Our study is driven by\nthe goal of applying and testing the limits of OR-Tools capabilities by\nintegrating Reinforcement Learning techniques for the first time. This enables\nus to compare these methods with OR-Tools, assessing their computational\neffectiveness and real-world application efficiency. 
Our analysis seeks to\nprovide insights into the effectiveness and practical application of each\ntechnique. Our findings indicate that Q-Learning stands out as the optimal\nstrategy, demonstrating superior efficiency by deviating only 1.2% on average\nfrom the optimal solutions across our datasets.\n","authors":["Alexandre Benoit","Pedram Asef"],"pdf_url":"https://arxiv.org/pdf/2503.03338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07804v3","updated":"2025-03-05T10:09:25Z","published":"2024-12-09T09:04:02Z","title":"XLSTM-HVED: Cross-Modal Brain Tumor Segmentation and MRI Reconstruction\n Method Using Vision XLSTM and Heteromodal Variational Encoder-Decoder","summary":" Neurogliomas are among the most aggressive forms of cancer, presenting\nconsiderable challenges in both treatment and monitoring due to their\nunpredictable biological behavior. Magnetic resonance imaging (MRI) is\ncurrently the preferred method for diagnosing and monitoring gliomas. However,\nthe lack of specific imaging techniques often compromises the accuracy of tumor\nsegmentation during the imaging process. To address this issue, we introduce\nthe XLSTM-HVED model. This model integrates a hetero-modal encoder-decoder\nframework with the Vision XLSTM module to reconstruct missing MRI modalities.\nBy deeply fusing spatial and temporal features, it enhances tumor segmentation\nperformance. The key innovation of our approach is the Self-Attention\nVariational Encoder (SAVE) module, which improves the integration of modal\nfeatures. Additionally, it optimizes the interaction of features between\nsegmentation and reconstruction tasks through the Squeeze-Fusion-Excitation\nCross Awareness (SFECA) module. Our experiments using the BraTS 2024 dataset\ndemonstrate that our model significantly outperforms existing advanced methods\nin handling cases where modalities are missing. 
Our source code is available at\nhttps://github.com/Quanato607/XLSTM-HVED.\n","authors":["Shenghao Zhu","Yifei Chen","Shuo Jiang","Weihong Chen","Chang Liu","Yuanhan Wang","Xu Chen","Yifan Ke","Feiwei Qin","Changmiao Wang","Zhu Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.07804v3.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.01072v2","updated":"2025-03-05T10:03:08Z","published":"2024-08-02T07:47:51Z","title":"A Survey on Self-play Methods in Reinforcement Learning","summary":" Self-play, characterized by agents' interactions with copies or past versions\nof themselves, has recently gained prominence in reinforcement learning (RL).\nThis paper first clarifies the preliminaries of self-play, including the\nmulti-agent reinforcement learning framework and basic game theory concepts.\nThen, it provides a unified framework and classifies existing self-play\nalgorithms within this framework. Moreover, the paper bridges the gap between\nthe algorithms and their practical implications by illustrating the role of\nself-play in different scenarios. Finally, the survey highlights open\nchallenges and future research directions in self-play. This paper is an\nessential guide map for understanding the multifaceted landscape of self-play\nin RL.\n","authors":["Ruize Zhang","Zelai Xu","Chengdong Ma","Chao Yu","Wei-Wei Tu","Wenhao Tang","Shiyu Huang","Deheng Ye","Wenbo Ding","Yaodong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.01072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03321v1","updated":"2025-03-05T09:55:07Z","published":"2025-03-05T09:55:07Z","title":"See What You Are Told: Visual Attention Sink in Large Multimodal Models","summary":" Large multimodal models (LMMs) \"see\" images by leveraging the attention\nmechanism between text and visual tokens in the transformer decoder. 
Ideally,\nthese models should focus on key visual information relevant to the text token.\nHowever, recent findings indicate that LMMs have an extraordinary tendency to\nconsistently allocate high attention weights to specific visual tokens, even\nwhen these tokens are irrelevant to the corresponding text. In this study, we\ninvestigate the property behind the appearance of these irrelevant visual\ntokens and examine their characteristics. Our findings show that this behavior\narises due to the massive activation of certain hidden state dimensions, which\nresembles the attention sink found in language models. Hence, we refer to this\nphenomenon as the visual attention sink. In particular, our analysis reveals\nthat removing the irrelevant visual sink tokens does not impact model\nperformance, despite receiving high attention weights. Consequently, we recycle\nthe attention to these tokens as surplus resources, redistributing the\nattention budget to enhance focus on the image. To achieve this, we introduce\nVisual Attention Redistribution (VAR), a method that redistributes attention in\nimage-centric heads, which we identify as innately focusing on visual\ninformation. VAR can be seamlessly applied across different LMMs to improve\nperformance on a wide range of tasks, including general vision-language tasks,\nvisual hallucination tasks, and vision-centric tasks, all without the need for\nadditional training, models, or inference steps. 
Experimental results\ndemonstrate that VAR enables LMMs to process visual information more\neffectively by adjusting their internal attention mechanisms, offering a new\ndirection to enhancing the multimodal capabilities of LMMs.\n","authors":["Seil Kang","Jinyeong Kim","Junhyeok Kim","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2503.03321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13149v3","updated":"2025-03-05T09:54:25Z","published":"2025-02-11T13:16:31Z","title":"Bi-Fact: A Bidirectional Factorization-based Evaluation of Intent\n Extraction from UI Trajectories","summary":" Evaluating intent extraction from GUIs demands accurate, fine-grained\nmetrics. This paper introduces Bi-Fact, a novel method that decomposes intents\ninto atomic facts and performs bidirectional comparisons to assess precision\nand recall. Experiments demonstrate Bi-Fact's superior correlation with human\njudgments compared to existing metrics, establishing a more robust evaluation\nframework for UI-driven intent understanding.\n","authors":["Sapir Caduri","Anatoly Efros","Noam Kahlon","Danielle Cohen","Yoni Halpern","Ido Dagan"],"pdf_url":"https://arxiv.org/pdf/2502.13149v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03766v3","updated":"2025-03-05T09:52:30Z","published":"2024-11-06T08:59:44Z","title":"Number Cookbook: Number Understanding of Language Models and How to\n Improve It","summary":" Large language models (LLMs) can solve an increasing number of complex\nreasoning tasks while making surprising mistakes in basic numerical\nunderstanding and processing (such as 9.11 > 9.9). The latter ability is\nessential for tackling complex arithmetic and mathematical problems and serves\nas a foundation for most reasoning tasks, but previous work paid little\nattention to it or only discussed several restricted tasks (like integer\naddition). In this paper, we comprehensively investigate the numerical\nunderstanding and processing ability (NUPA) of LLMs. 
Firstly, we introduce a\nbenchmark covering four common numerical representations and 17 distinct\nnumerical tasks in four major categories, resulting in 41 meaningful\ncombinations in total. These tasks are derived from primary and secondary\neducation curricula, encompassing nearly all everyday numerical understanding\nand processing scenarios, and the rules of these tasks are very simple and\nclear. Through the benchmark, we find that current LLMs fail frequently in many\nof the tasks. To study the problem, we train small models with existing and\npotential techniques for enhancing NUPA (such as tokenizers, PEs, and number\nformats), comprehensively evaluating their effectiveness using our testbed. We\nalso finetune practical-scale LLMs on our proposed NUPA tasks and find that 1)\nnaive finetuning can improve NUPA a lot on many but not all tasks, and 2)\nsurprisingly, techniques designed to enhance NUPA prove ineffective for\nfinetuning pretrained models. We further explore the impact of chain-of-thought\ntechniques on NUPA. Our work provides a more detailed and comprehensive\nunderstanding of NUPA in LLMs. Our benchmark and code are released at\nhttps://github.com/GraphPKU/number_cookbook.\n","authors":["Haotong Yang","Yi Hu","Shijia Kang","Zhouchen Lin","Muhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.03766v3.pdf","comment":"ICLR 2025 poster"},{"id":"http://arxiv.org/abs/2501.05891v2","updated":"2025-03-05T09:18:31Z","published":"2025-01-10T11:44:35Z","title":"Affordably Fine-tuned LLMs Provide Better Answers to Course-specific\n MCQs","summary":" In education, the capability of generating human-like text of Large Language\nModels (LLMs) inspired work on how they can increase the efficiency of learning\nand teaching. We study the affordability of these models for educators and\nstudents by investigating how LLMs answer multiple-choice questions (MCQs) with\nrespect to hardware constraints and refinement techniques. 
We explore this\nspace by using generic pre-trained LLMs (the 7B, 13B, and 70B variants of\nLLaMA-2) to answer 162 undergraduate-level MCQs from a course on Programming\nLanguages (PL) -- the MCQ dataset is a contribution of this work, which we make\npublicly available. Specifically, we dissect how different factors, such as\nusing readily-available material -- (parts of) the course's textbook -- for\nfine-tuning and quantisation (to decrease resource usage) can change the\naccuracy of the responses. The main takeaway is that smaller textbook-based\nfine-tuned models outperform generic larger ones (whose pre-training requires\nconspicuous resources), making the usage of LLMs for answering MCQs resource-\nand material-wise affordable.\n","authors":["Bianca Raimondi","Saverio Giallorenzo","Maurizio Gabbrielli"],"pdf_url":"https://arxiv.org/pdf/2501.05891v2.pdf","comment":"The 40th ACM/SIGAPP Symposium On Applied Computing"},{"id":"http://arxiv.org/abs/2503.02368v2","updated":"2025-03-05T09:12:25Z","published":"2025-03-04T07:49:10Z","title":"Iterative Value Function Optimization for Guided Decoding","summary":" While Reinforcement Learning from Human Feedback (RLHF) has become the\npredominant method for controlling language model outputs, it suffers from high\ncomputational costs and training instability. Guided decoding, especially\nvalue-guided methods, offers a cost-effective alternative by controlling\noutputs without re-training models. However, the accuracy of the value function\nis crucial for value-guided decoding, as inaccuracies can lead to suboptimal\ndecision-making and degraded performance. Existing methods struggle with\naccurately estimating the optimal value function, leading to less effective\ncontrol. 
We propose Iterative Value Function Optimization, a novel framework\nthat addresses these limitations through two key components: Monte Carlo Value\nEstimation, which reduces estimation variance by exploring diverse\ntrajectories, and Iterative On-Policy Optimization, which progressively\nimproves value estimation through collecting trajectories from value-guided\npolicies. Extensive experiments on text summarization, multi-turn dialogue, and\ninstruction following demonstrate the effectiveness of value-guided decoding\napproaches in aligning language models. These approaches not only achieve\nalignment but also significantly reduce computational costs by leveraging\nprincipled value function optimization for efficient and effective control.\n","authors":["Zhenhua Liu","Lijun Li","Ruizhe Chen","Yuxian Jiang","Tong Zhu","Zhaochen Su","Wenliang Chen","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2503.02368v2.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2503.03283v1","updated":"2025-03-05T09:09:01Z","published":"2025-03-05T09:09:01Z","title":"Exploring specialization and sensitivity of convolutional neural\n networks in the context of simultaneous image augmentations","summary":" Drawing parallels with the way biological networks are studied, we adapt the\ntreatment--control paradigm to explainable artificial intelligence research and\nenrich it through multi-parametric input alterations. In this study, we propose\na framework for investigating the internal inference impacted by input data\naugmentations. The internal changes in network operation are reflected in\nactivation changes measured by variance, which can be decomposed into\ncomponents related to each augmentation, employing Sobol indices and Shapley\nvalues. These quantities enable one to visualize sensitivity to different\nvariables and use them for guided masking of activations. 
In addition, we\nintroduce a way of single-class sensitivity analysis where the candidates are\nfiltered according to their matching to prediction bias generated by targeted\ndamaging of the activations. Relying on the observed parallels, we assume that\nthe developed framework can potentially be transferred to studying biological\nneural networks in complex environments.\n","authors":["Pavel Kharyuk","Sergey Matveev","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2503.03283v1.pdf","comment":"26 pages; main text: 5 figures, 4 tables; appendix: 4 sections, 3\n tables; supplementary: 7 files (figures S1-S6: packed as 7z archive, S7:\n single pdf file)"},{"id":"http://arxiv.org/abs/2412.12843v2","updated":"2025-03-05T09:03:18Z","published":"2024-12-17T12:11:04Z","title":"SLTNet: Efficient Event-based Semantic Segmentation with Spike-driven\n Lightweight Transformer-based Networks","summary":" Event-based semantic segmentation has great potential in autonomous driving\nand robotics due to the advantages of event cameras, such as high dynamic\nrange, low latency, and low power cost. Unfortunately, current artificial\nneural network (ANN)-based segmentation methods suffer from high computational\ndemands, the requirements for image frames, and massive energy consumption,\nlimiting their efficiency and application on resource-constrained edge/mobile\nplatforms. To address these problems, we introduce SLTNet, a spike-driven\nlightweight transformer-based network designed for event-based semantic\nsegmentation. Specifically, SLTNet is built on efficient spike-driven\nconvolution blocks (SCBs) to extract rich semantic features while reducing the\nmodel's parameters. Then, to enhance the long-range contextural feature\ninteraction, we propose novel spike-driven transformer blocks (STBs) with\nbinary mask operations. 
Based on these basic blocks, SLTNet employs a\nhigh-efficiency single-branch architecture while maintaining the low energy\nconsumption of the Spiking Neural Network (SNN). Finally, extensive experiments\non DDD17 and DSEC-Semantic datasets demonstrate that SLTNet outperforms\nstate-of-the-art (SOTA) SNN-based methods by at most 9.06% and 9.39% mIoU,\nrespectively, with extremely 4.58x lower energy consumption and 114 FPS\ninference speed. Our code is open-sourced and available at\nhttps://github.com/longxianlei/SLTNet-v1.0.\n","authors":["Xiaxin Zhu","Fangming Guo","Xianlei Long","Qingyi Gu","Chao Chen","Fuqiang Gu"],"pdf_url":"https://arxiv.org/pdf/2412.12843v2.pdf","comment":"Submitted to 2025 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2025)"},{"id":"http://arxiv.org/abs/2503.03274v1","updated":"2025-03-05T08:56:26Z","published":"2025-03-05T08:56:26Z","title":"Benchmarking Dynamic SLO Compliance in Distributed Computing Continuum\n Systems","summary":" Ensuring Service Level Objectives (SLOs) in large-scale architectures, such\nas Distributed Computing Continuum Systems (DCCS), is challenging due to their\nheterogeneous nature and varying service requirements across different devices\nand applications. Additionally, unpredictable workloads and resource\nlimitations lead to fluctuating performance and violated SLOs. To improve SLO\ncompliance in DCCS, one possibility is to apply machine learning; however, the\ndesign choices are often left to the developer. To that extent, we provide a\nbenchmark of Active Inference -- an emerging method from neuroscience --\nagainst three established reinforcement learning algorithms (Deep Q-Network,\nAdvantage Actor-Critic, and Proximal Policy Optimization). We consider a\nrealistic DCCS use case: an edge device running a video conferencing\napplication alongside a WebSocket server streaming videos. 
Using one of the\nrespective algorithms, we continuously monitor key performance metrics, such as\nlatency and bandwidth usage, to dynamically adjust parameters -- including the\nnumber of streams, frame rate, and resolution -- to optimize service quality\nand user experience. To test algorithms' adaptability to constant system\nchanges, we simulate dynamically changing SLOs and both instant and gradual\ndata-shift scenarios, such as network bandwidth limitations and fluctuating\ndevice thermal states. Although the evaluated algorithms all showed advantages\nand limitations, our findings demonstrate that Active Inference is a promising\napproach for ensuring SLO compliance in DCCS, offering lower memory usage,\nstable CPU utilization, and fast convergence.\n","authors":["Alfreds Lapkovskis","Boris Sedlak","Sindri Magnússon","Schahram Dustdar","Praveen Kumar Donta"],"pdf_url":"https://arxiv.org/pdf/2503.03274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03269v1","updated":"2025-03-05T08:50:53Z","published":"2025-03-05T08:50:53Z","title":"Conformal Transformations for Symmetric Power Transformers","summary":" Transformers with linear attention offer significant computational advantages\nover softmax-based transformers but often suffer from degraded performance. The\nsymmetric power (sympow) transformer, a particular type of linear transformer,\naddresses some of this performance gap by leveraging symmetric tensor\nembeddings, achieving comparable performance to softmax transformers. However,\nthe finite capacity of the recurrent state in sympow transformers limits their\nability to retain information, leading to performance degradation when scaling\nthe training or evaluation context length. To address this issue, we propose\nthe conformal-sympow transformer, which dynamically frees up capacity using\ndata-dependent multiplicative gating and adaptively stores information using\ndata-dependent rotary embeddings. 
Preliminary experiments on the LongCrawl64\ndataset demonstrate that conformal-sympow overcomes the limitations of sympow\ntransformers, achieving robust performance across scaled training and\nevaluation contexts.\n","authors":["Saurabh Kumar","Jacob Buckman","Carles Gelada","Sean Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03269v1.pdf","comment":"SCOPE Workshop at ICLR 2025"},{"id":"http://arxiv.org/abs/2502.09977v2","updated":"2025-03-05T08:48:25Z","published":"2025-02-14T08:04:22Z","title":"LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs\n -- No Silver Bullet for LC or RAG Routing","summary":" Effectively incorporating external knowledge into Large Language Models\n(LLMs) is crucial for enhancing their capabilities and addressing real-world\nneeds. Retrieval-Augmented Generation (RAG) offers an effective method for\nachieving this by retrieving the most relevant fragments into LLMs. However,\nthe advancements in context window size for LLMs offer an alternative approach,\nraising the question of whether RAG remains necessary for effectively handling\nexternal knowledge. Several existing studies provide inconclusive comparisons\nbetween RAG and long-context (LC) LLMs, largely due to limitations in the\nbenchmark designs. In this paper, we present LaRA, a novel benchmark\nspecifically designed to rigorously compare RAG and LC LLMs. LaRA encompasses\n2326 test cases across four practical QA task categories and three types of\nnaturally occurring long texts. Through systematic evaluation of seven\nopen-source and four proprietary LLMs, we find that the optimal choice between\nRAG and LC depends on a complex interplay of factors, including the model's\nparameter size, long-text capabilities, context length, task type, and the\ncharacteristics of the retrieved chunks. Our findings provide actionable\nguidelines for practitioners to effectively leverage both RAG and LC approaches\nin developing and deploying LLM applications. 
Our code and dataset is provided\nat:\n\\href{https://github.com/Alibaba-NLP/LaRA}{\\textbf{https://github.com/Alibaba-NLP/LaRA}}.\n","authors":["Kuan Li","Liwen Zhang","Yong Jiang","Pengjun Xie","Fei Huang","Shuai Wang","Minhao Cheng"],"pdf_url":"https://arxiv.org/pdf/2502.09977v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2503.03262v1","updated":"2025-03-05T08:38:51Z","published":"2025-03-05T08:38:51Z","title":"Trajectory Prediction for Autonomous Driving: Progress, Limitations, and\n Future Directions","summary":" As the potential for autonomous vehicles to be integrated on a large scale\ninto modern traffic systems continues to grow, ensuring safe navigation in\ndynamic environments is crucial for smooth integration. To guarantee safety and\nprevent collisions, autonomous vehicles must be capable of accurately\npredicting the trajectories of surrounding traffic agents. Over the past\ndecade, significant efforts from both academia and industry have been dedicated\nto designing solutions for precise trajectory forecasting. These efforts have\nproduced a diverse range of approaches, raising questions about the differences\nbetween these methods and whether trajectory prediction challenges have been\nfully addressed. This paper reviews a substantial portion of recent trajectory\nprediction methods and devises a taxonomy to classify existing solutions. A\ngeneral overview of the prediction pipeline is also provided, covering input\nand output modalities, modeling features, and prediction paradigms discussed in\nthe literature. 
In addition, the paper discusses active research areas within\ntrajectory prediction, addresses the posed research questions, and highlights\nthe remaining research gaps and challenges.\n","authors":["Nadya Abdel Madjid","Abdulrahman Ahmad","Murad Mebrahtu","Yousef Babaa","Abdelmoamen Nasser","Sumbal Malik","Bilal Hassan","Naoufel Werghi","Jorge Dias","Majid Khonji"],"pdf_url":"https://arxiv.org/pdf/2503.03262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05274v2","updated":"2025-03-05T08:36:27Z","published":"2024-09-17T10:08:37Z","title":"Scale-Invariant Object Detection by Adaptive Convolution with Unified\n Global-Local Context","summary":" Dense features are important for detecting minute objects in images.\nUnfortunately, despite the remarkable efficacy of the CNN models in multi-scale\nobject detection, CNN models often fail to detect smaller objects in images due\nto the loss of dense features during the pooling process. Atrous convolution\naddresses this issue by applying sparse kernels. However, sparse kernels often\ncan lose the multi-scale detection efficacy of the CNN model. In this paper, we\npropose an object detection model using a Switchable (adaptive) Atrous\nConvolutional Network (SAC-Net) based on the efficientDet model. A fixed atrous\nrate limits the performance of the CNN models in the convolutional layers. To\novercome this limitation, we introduce a switchable mechanism that allows for\ndynamically adjusting the atrous rate during the forward pass. The proposed\nSAC-Net encapsulates the benefits of both low-level and high-level features to\nachieve improved performance on multi-scale object detection tasks, without\nlosing the dense features. Further, we apply a depth-wise switchable atrous\nrate to the proposed network, to improve the scale-invariant features. Finally,\nwe apply global context on the proposed model. 
Our extensive experiments on\nbenchmark datasets demonstrate that the proposed SAC-Net outperforms the\nstate-of-the-art models by a significant margin in terms of accuracy.\n","authors":["Amrita Singh","Snehasis Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2410.05274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03258v1","updated":"2025-03-05T08:28:11Z","published":"2025-03-05T08:28:11Z","title":"Exploring the Potential of Large Language Models as Predictors in\n Dynamic Text-Attributed Graphs","summary":" With the rise of large language models (LLMs), there has been growing\ninterest in Graph Foundation Models (GFMs) for graph-based tasks. By leveraging\nLLMs as predictors, GFMs have demonstrated impressive generalizability across\nvarious tasks and datasets. However, existing research on LLMs as predictors\nhas predominantly focused on static graphs, leaving their potential in dynamic\ngraph prediction unexplored. In this work, we pioneer using LLMs for predictive\ntasks on dynamic graphs. We identify two key challenges: the constraints\nimposed by context length when processing large-scale historical data and the\nsignificant variability in domain characteristics, both of which complicate the\ndevelopment of a unified predictor. To address these challenges, we propose the\nGraphAgent-Dynamic (GAD) Framework, a multi-agent system that leverages\ncollaborative LLMs. In contrast to using a single LLM as the predictor, GAD\nincorporates global and local summary agents to generate domain-specific\nknowledge, enhancing its transferability across domains. Additionally,\nknowledge reflection agents enable adaptive updates to GAD's knowledge,\nmaintaining a unified and self-consistent architecture. In experiments, GAD\ndemonstrates performance comparable to or even exceeds that of full-supervised\ngraph neural networks without dataset-specific training. 
Finally, to enhance\nthe task-specific performance of LLM-based predictors, we discuss potential\nimprovements, such as dataset-specific fine-tuning to LLMs. By developing\ntailored strategies for different tasks, we provide new insights for the future\ndesign of LLM-based predictors.\n","authors":["Runlin Lei","Jiarui Ji","Haipeng Ding","Lu Yi","Zhewei Wei","Yongchao Liu","Chuntao Hong"],"pdf_url":"https://arxiv.org/pdf/2503.03258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01669v2","updated":"2025-03-05T08:23:02Z","published":"2024-01-16T13:41:00Z","title":"Improved Performances and Motivation in Intelligent Tutoring Systems:\n Combining Machine Learning and Learner Choice","summary":" Large class sizes challenge personalized learning in schools, prompting the\nuse of educational technologies such as intelligent tutoring systems. To\naddress this, we present an AI-driven personalization system, called ZPDES,\nbased on the Learning Progress Hypothesis - modeling curiosity-driven learning\n- and multi-armed bandit techniques. It sequences exercises that maximize\nlearning progress for each student. While previous studies demonstrated its\nefficacy in enhancing learning compared to hand-made curricula, its impact on\nstudent motivation remained unexplored. Furthermore, ZPDES previously lacked\nfeatures allowing student choice, a limitation in agency that conflicts with\nits foundation on models of curiosity-driven learning. This study investigates\nhow integrating choice, as a gamification element unrelated to exercise\ndifficulty, affects both learning outcomes and motivation. We conducted an\nextensive field study (265 7-8 years old children, RCT design), comparing ZPDES\nwith and without choice against a hand-designed curriculum. Results show that\nZPDES improves both learning performance and the learning experience. Moreover\nadding choice to ZPDES enhances intrinsic motivation and further strengthens\nits learning benefits. 
In contrast, incorporating choice into a fixed, linear\ncurriculum negatively impacts learning outcomes. These findings highlight that\nthe intrinsic motivation elicited by choice (gamification) is beneficial only\nwhen paired with an adaptive personalized learning system. This insight is\ncritical as gamified features become increasingly prevalent in educational\ntechnologies.\n","authors":["Benjamin Clément","Hélène Sauzéon","Didier Roy","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2402.01669v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03245v1","updated":"2025-03-05T07:53:39Z","published":"2025-03-05T07:53:39Z","title":"Less is more? Rewards in RL for Cyber Defence","summary":" The last few years has seen an explosion of interest in autonomous cyber\ndefence agents based on deep reinforcement learning. Such agents are typically\ntrained in a cyber gym environment, also known as a cyber simulator, at least\n32 of which have already been built. Most, if not all cyber gyms provide dense\n\"scaffolded\" reward functions which combine many penalties or incentives for a\nrange of (un)desirable states and costly actions. Whilst dense rewards help\nalleviate the challenge of exploring complex environments, yielding seemingly\neffective strategies from relatively few environment steps; they are also known\nto bias the solutions an agent can find, potentially towards suboptimal\nsolutions. Sparse rewards could offer preferable or more effective solutions\nand have been overlooked by cyber gyms to date. In this work we set out to\nevaluate whether sparse reward functions might enable training more effective\ncyber defence agents. Towards this goal we first break down several evaluation\nlimitations in existing work by proposing a ground truth evaluation score that\ngoes beyond the standard RL paradigm used to train and evaluate agents. 
By\nadapting a well-established cyber gym to accommodate our methodology and ground\ntruth score, we propose and evaluate two sparse reward mechanisms and compare\nthem with a typical dense reward. Our evaluation considers a range of network\nsizes, from 2 to 50 nodes, and both reactive and proactive defensive actions.\nOur results show that sparse rewards, particularly positive reinforcement for\nan uncompromised network state, enable the training of more effective cyber\ndefence agents. Furthermore, we show that sparse rewards provide more stable\ntraining than dense rewards, and that both effectiveness and training stability\nare robust to a variety of cyber environment considerations.\n","authors":["Elizabeth Bates","Chris Hicks","Vasilios Mavroudis"],"pdf_url":"https://arxiv.org/pdf/2503.03245v1.pdf","comment":"4 Pages"},{"id":"http://arxiv.org/abs/2503.03238v1","updated":"2025-03-05T07:34:53Z","published":"2025-03-05T07:34:53Z","title":"FANS -- Formal Answer Selection for Natural Language Math Reasoning\n Using Lean4","summary":" Large Language Models (LLMs) have displayed astonishing abilities in various\ntasks, especially in text generation, classification, question answering, etc.\nHowever, the reasoning ability of LLMs still faces many debates. The inherent\nambiguity of Natural Language (NL) limits LLMs' ability to perform verifiable\nreasoning, making its answers lack coherence and trustworthy support. To tackle\nthe above problems, we propose a novel framework named FANS: Formal ANswer\nSelection for Natural Language Math Reasoning Using Lean4. To the best of our\nknowledge, it is the first framework that utilizes Lean4 to enhance LLMs' NL\nmath reasoning ability. In particular, given an NL math question and\nLLM-generated answers, FANS first translates it into Lean4 theorem statements.\nThen it tries to prove it using a Lean4 prover and verify it by Lean4. Finally,\nit uses the FL result to assist in answer selection. 
It enhances LLMs' NL math\nability in providing a computer-verifiable solution for its correct answer and\nproposes an alternative method for answer selection beyond the reward model.\nExtensive experiments indicate the effectiveness of our framework. It can\nimprove the accuracy rate of reward model enhanced LLMs in the MATH-500 dataset\nby at most 1.91% and AMC-23 by at most 8.33% on strong reward-model baselines.\nIn some particular fields like number theory that Lean4 experts in, we can even\nselect all correct solutions. The qualitative analysis also shows our framework\ncan make NL results formally backed by Lean4 proofs. As a pioneering work in\nthe corresponding field, we will open-source all our models and datasets to\nfurther boost the development of the field.\n","authors":["Jiarui Yao","Ruida Wang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.17107v3","updated":"2025-03-05T07:29:42Z","published":"2024-12-22T17:39:32Z","title":"Grams: Gradient Descent with Adaptive Momentum Scaling","summary":" We introduce $\\mathbf{G}$radient Descent with $\\mathbf{A}$daptive\n$\\mathbf{M}$omentum $\\mathbf{S}$caling ($\\mathbf{Grams}$), a novel optimization\nalgorithm that decouples the direction and magnitude of parameter updates in\ndeep learning. Unlike traditional optimizers that directly integrate momentum\ninto updates, Grams separates the update direction, derived from current\ngradients, from momentum, which is used solely for adaptive magnitude scaling.\nThis approach enables Grams to achieve improved loss descent compared to\nstate-of-the-art cautious and momentum-based optimizers. We theoretically\ndemonstrate that Grams descents faster than other state-of-the-art optimizers\nand establish a global convergence guarantee for Grams. We also validate its\neffectiveness through extensive empirical evaluations. 
The results demonstrate\nGrams' superior performance, including faster convergence and better\ngeneralization, compared to widely-used optimizers such as Adam, Lion, and\ntheir cautious variants. Our results highlight Grams' potential as a\ntransformative approach for efficiently training and fine-tuning large language\nmodels. Code is available at https://github.com/Gunale0926/Grams.\n","authors":["Yang Cao","Xiaoyu Li","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2412.17107v3.pdf","comment":"SCOPE Workshop @ ICLR 2025"},{"id":"http://arxiv.org/abs/2412.09601v2","updated":"2025-03-05T07:06:15Z","published":"2024-12-12T18:59:11Z","title":"TimeRefine: Temporal Grounding with Time Refining Video LLM","summary":" Video temporal grounding aims to localize relevant temporal boundaries in a\nvideo given a textual prompt. Recent work has focused on enabling Video LLMs to\nperform video temporal grounding via next-token prediction of temporal\ntimestamps. However, accurately localizing timestamps in videos remains\nchallenging for Video LLMs when relying solely on temporal token prediction.\nOur proposed TimeRefine addresses this challenge in two ways. First, instead of\ndirectly predicting the start and end timestamps, we reformulate the temporal\ngrounding task as a temporal refining task: the model first makes rough\npredictions and then refines them by predicting offsets to the target segment.\nThis refining process is repeated multiple times, through which the model\nprogressively self-improves its temporal localization accuracy. Second, to\nenhance the model's temporal perception capabilities, we incorporate an\nauxiliary prediction head that penalizes the model more if a predicted segment\ndeviates further from the ground truth, thus encouraging the model to make\ncloser and more accurate predictions. Our plug-and-play method can be\nintegrated into most LLM-based temporal grounding approaches. 
The experimental\nresults demonstrate that TimeRefine achieves 3.6% and 5.0% mIoU improvements on\nthe ActivityNet and Charades-STA datasets, respectively. Code and pretrained\nmodels will be released.\n","authors":["Xizi Wang","Feng Cheng","Ziyang Wang","Huiyu Wang","Md Mohaiminul Islam","Lorenzo Torresani","Mohit Bansal","Gedas Bertasius","David Crandall"],"pdf_url":"https://arxiv.org/pdf/2412.09601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01214v3","updated":"2025-03-05T07:02:28Z","published":"2024-07-01T11:59:59Z","title":"Revisiting Random Walks for Learning on Graphs","summary":" We revisit a simple model class for machine learning on graphs, where a\nrandom walk on a graph produces a machine-readable record, and this record is\nprocessed by a deep neural network to directly make vertex-level or graph-level\npredictions. We call these stochastic machines random walk neural networks\n(RWNNs), and through principled analysis, show that we can design them to be\nisomorphism invariant while capable of universal approximation of graph\nfunctions in probability. A useful finding is that almost any kind of record of\nrandom walks guarantees probabilistic invariance as long as the vertices are\nanonymized. This enables us, for example, to record random walks in plain text\nand adopt a language model to read these text records to solve graph tasks. We\nfurther establish a parallelism to message passing neural networks using tools\nfrom Markov chain theory, and show that over-smoothing in message passing is\nalleviated by construction in RWNNs, while over-squashing manifests as\nprobabilistic under-reaching. We empirically demonstrate RWNNs on a range of\nproblems, verifying our theoretical analysis and demonstrating the use of\nlanguage models for separating strongly regular graphs where 3-WL test fails,\nand transductive classification on arXiv citation network. 
Code is available at\nhttps://github.com/jw9730/random-walk.\n","authors":["Jinwoo Kim","Olga Zaghen","Ayhan Suleymanzade","Youngmin Ryou","Seunghoon Hong"],"pdf_url":"https://arxiv.org/pdf/2407.01214v3.pdf","comment":"51 pages, 14 figures"},{"id":"http://arxiv.org/abs/2502.17543v2","updated":"2025-03-05T06:53:52Z","published":"2025-02-24T18:56:58Z","title":"Training a Generally Curious Agent","summary":" Efficient exploration is essential for intelligent systems interacting with\ntheir environment, but existing language models often fall short in scenarios\nthat require strategic information gathering. In this paper, we present\nPAPRIKA, a fine-tuning approach that enables language models to develop general\ndecision-making capabilities that are not confined to particular environments.\nBy training on synthetic interaction data from different tasks that require\ndiverse strategies, PAPRIKA teaches models to explore and adapt their behavior\non a new task based on environment feedback in-context without more gradient\nupdates. Experimental results show that models fine-tuned with PAPRIKA can\neffectively transfer their learned decision-making capabilities to entirely\nunseen tasks without additional training. Unlike traditional training, our\napproach's primary bottleneck lies in sampling useful interaction data instead\nof model updates. To improve sample efficiency, we propose a curriculum\nlearning strategy that prioritizes sampling trajectories from tasks with high\nlearning potential. 
These results suggest a promising path towards AI systems\nthat can autonomously solve novel sequential decision-making problems that\nrequire interactions with the external world.\n","authors":["Fahim Tajwar","Yiding Jiang","Abitha Thankaraj","Sumaita Sadia Rahman","J Zico Kolter","Jeff Schneider","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2502.17543v2.pdf","comment":"Project Website: https://paprika-llm.github.io"},{"id":"http://arxiv.org/abs/2407.10341v5","updated":"2025-03-05T06:53:17Z","published":"2024-07-14T21:41:29Z","title":"Affordance-Guided Reinforcement Learning via Visual Prompting","summary":" Robots equipped with reinforcement learning (RL) have the potential to learn\na wide range of skills solely from a reward signal. However, obtaining a robust\nand dense reward signal for general manipulation tasks remains a challenge.\nExisting learning-based approaches require significant data, such as human\ndemonstrations of success and failure, to learn task-specific reward functions.\nRecently, there is also a growing adoption of large multi-modal foundation\nmodels for robotics that can perform visual reasoning in physical contexts and\ngenerate coarse robot motions for manipulation tasks. Motivated by this range\nof capability, in this work, we present Keypoint-based Affordance Guidance for\nImprovements (KAGI), a method leveraging rewards shaped by vision-language\nmodels (VLMs) for autonomous RL. State-of-the-art VLMs have demonstrated\nimpressive reasoning about affordances through keypoints in zero-shot, and we\nuse these to define dense rewards that guide autonomous robotic learning. On\nreal-world manipulation tasks specified by natural language descriptions, KAGI\nimproves the sample efficiency of autonomous RL and enables successful task\ncompletion in 30K online fine-tuning steps. 
Additionally, we demonstrate the\nrobustness of KAGI to reductions in the number of in-domain demonstrations used\nfor pre-training, reaching similar performance in 45K online fine-tuning steps.\nProject website: https://sites.google.com/view/affordance-guided-rl\n","authors":["Olivia Y. Lee","Annie Xie","Kuan Fang","Karl Pertsch","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2407.10341v5.pdf","comment":"8 pages, 6 figures. Robotics: Science and Systems (RSS) 2024, Task\n Specification for General-Purpose Intelligent Robots & Lifelong Robot\n Learning Workshops"},{"id":"http://arxiv.org/abs/2408.08927v2","updated":"2025-03-05T06:23:52Z","published":"2024-08-15T20:06:06Z","title":"VerilogCoder: Autonomous Verilog Coding Agents with Graph-based Planning\n and Abstract Syntax Tree (AST)-based Waveform Tracing Tool","summary":" Due to the growing complexity of modern Integrated Circuits (ICs), automating\nhardware design can prevent a significant amount of human error from the\nengineering process and result in less errors. Verilog is a popular hardware\ndescription language for designing and modeling digital systems; thus, Verilog\ngeneration is one of the emerging areas of research to facilitate the design\nprocess. In this work, we propose VerilogCoder, a system of multiple Artificial\nIntelligence (AI) agents for Verilog code generation, to autonomously write\nVerilog code and fix syntax and functional errors using collaborative Verilog\ntools (i.e., syntax checker, simulator, and waveform tracer). Firstly, we\npropose a task planner that utilizes a novel Task and Circuit Relation Graph\nretrieval method to construct a holistic plan based on module descriptions. To\ndebug and fix functional errors, we develop a novel and efficient abstract\nsyntax tree (AST)-based waveform tracing tool, which is integrated within the\nautonomous Verilog completion flow. 
The proposed methodology successfully\ngenerates 94.2% syntactically and functionally correct Verilog code, surpassing\nthe state-of-the-art methods by 33.9% on the VerilogEval-Human v2 benchmark.\n","authors":["Chia-Tung Ho","Haoxing Ren","Brucek Khailany"],"pdf_url":"https://arxiv.org/pdf/2408.08927v2.pdf","comment":"main paper 7 pages, reference 1 page, it is the version that accepted\n by AAAI 2025"},{"id":"http://arxiv.org/abs/2502.16802v2","updated":"2025-03-05T06:23:22Z","published":"2025-02-24T03:25:56Z","title":"Unsupervised Topic Models are Data Mixers for Pre-training Language\n Models","summary":" The performance of large language models (LLMs) is significantly affected by\nthe quality and composition of their pre-training data, which is inherently\ndiverse, spanning various domains, sources, and topics. Effectively integrating\nthese heterogeneous data sources is crucial for optimizing LLM performance.\nPrevious research has predominantly concentrated on domain-based data mixing,\noften neglecting the nuanced topic-level characteristics of the data. To\naddress this gap, we propose a simple yet effective topic-based data mixing\nstrategy that utilizes fine-grained topics generated through our topic modeling\nmethod, DataWeave. DataWeave employs a multi-stage clustering process to group\nsemantically similar documents and utilizes LLMs to generate detailed topics,\nthereby facilitating a more nuanced understanding of dataset composition. 
Our\nstrategy employs heuristic methods to upsample or downsample specific topics,\nwhich significantly enhances LLM performance on downstream tasks, achieving\nsuperior results compared to previous, more complex data mixing approaches.\nFurthermore, we confirm that the topics Science and Relationships are\nparticularly effective, yielding the most substantial performance improvements.\nWe will make our code and datasets publicly available.\n","authors":["Jiahui Peng","Xinlin Zhuang","Qiu Jiantao","Ren Ma","Jing Yu","Tianyi Bai","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2502.16802v2.pdf","comment":"18 pages,7 figures"},{"id":"http://arxiv.org/abs/2503.03215v1","updated":"2025-03-05T06:16:15Z","published":"2025-03-05T06:16:15Z","title":"COSINT-Agent: A Knowledge-Driven Multimodal Agent for Chinese Open\n Source Intelligence","summary":" Open Source Intelligence (OSINT) requires the integration and reasoning of\ndiverse multimodal data, presenting significant challenges in deriving\nactionable insights. Traditional approaches, including multimodal large\nlanguage models (MLLMs), often struggle to infer complex contextual\nrelationships or deliver comprehensive intelligence from unstructured data\nsources. In this paper, we introduce COSINT-Agent, a knowledge-driven\nmultimodal agent tailored to address the challenges of OSINT in the Chinese\ndomain. COSINT-Agent seamlessly integrates the perceptual capabilities of\nfine-tuned MLLMs with the structured reasoning power of the Entity-Event-Scene\nKnowledge Graph (EES-KG). Central to COSINT-Agent is the innovative EES-Match\nframework, which bridges COSINT-MLLM and EES-KG, enabling systematic\nextraction, reasoning, and contextualization of multimodal insights. This\nintegration facilitates precise entity recognition, event interpretation, and\ncontext retrieval, effectively transforming raw multimodal data into actionable\nintelligence. 
Extensive experiments validate the superior performance of\nCOSINT-Agent across core OSINT tasks, including entity recognition, EES\ngeneration, and context matching. These results underscore its potential as a\nrobust and scalable solution for advancing automated multimodal reasoning and\nenhancing the effectiveness of OSINT methodologies.\n","authors":["Wentao Li","Congcong Wang","Xiaoxiao Cui","Zhi Liu","Wei Guo","Lizhen Cui"],"pdf_url":"https://arxiv.org/pdf/2503.03215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03211v1","updated":"2025-03-05T06:06:16Z","published":"2025-03-05T06:06:16Z","title":"NodeReg: Mitigating the Imbalance and Distribution Shift Effects in\n Semi-Supervised Node Classification via Norm Consistency","summary":" Aggregating information from neighboring nodes benefits graph neural networks\n(GNNs) in semi-supervised node classification tasks. Nevertheless, this\nmechanism also renders nodes susceptible to the influence of their neighbors.\nFor instance, this will occur when the neighboring nodes are imbalanced or the\nneighboring nodes contain noise, which can even affect the GNN's ability to\ngeneralize out of distribution. We find that ensuring the consistency of the\nnorm for node representations can significantly reduce the impact of these two\nissues on GNNs. To this end, we propose a regularized optimization method\ncalled NodeReg that enforces the consistency of node representation norms. This\nmethod is simple but effective and satisfies Lipschitz continuity, thus\nfacilitating stable optimization and significantly improving semi-supervised\nnode classification performance under the above two scenarios. To illustrate,\nin the imbalance scenario, when training a GCN with an imbalance ratio of 0.1,\nNodeReg outperforms the most competitive baselines by 1.4%-25.9% in F1 score\nacross five public datasets. 
Similarly, in the distribution shift scenario,\nNodeReg outperforms the most competitive baseline by 1.4%-3.1% in accuracy.\n","authors":["Shenzhi Yang","Jun Xia","Jingbo Zhou","Xingkai Yao","Xiaofang Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03205v1","updated":"2025-03-05T05:50:31Z","published":"2025-03-05T05:50:31Z","title":"MA-LoT: Multi-Agent Lean-based Long Chain-of-Thought Reasoning enhances\n Formal Theorem Proving","summary":" Solving mathematical problems using computer-verifiable languages like Lean\nhas significantly impacted mathematical and computer science communities.\nState-of-the-art methods utilize single Large Language Models (LLMs) as agents\nor provers to either generate complete proof or perform tree searches. However,\nsingle-agent methods inherently lack a structured way to combine high-level\nreasoning in Natural Language (NL) with Formal Language (FL) verification\nfeedback. To solve these issues, we propose MA-LoT: Multi-Agent Lean-based Long\nChain-of-Thought framework, (to the best of our knowledge), the first\nmulti-agent framework for Lean4 theorem proving that balance high-level NL\nreasoning and FL verification in Long CoT. Using this structured interaction,\nour approach enables deeper insights and long-term coherence in proof\ngeneration, with which past methods struggle. We do this by leveraging emergent\nformal reasoning ability in Long CoT using our novel LoT-Transfer Learning\ntraining-inference pipeline. Extensive experiments show that our framework\nachieves 54.51% accuracy rate on the Lean4 version of MiniF2F-Test dataset,\nlargely outperforming GPT-4 (22.95%), single-agent tree search\n(InternLM-Step-Prover, 50.70%), and whole-proof generation\n(DeepSeek-Prover-v1.5, 48.36%) baselines. 
Furthermore, our findings highlight\nthe potential of combining Long CoT with formal verification for a more\ninsightful generation in a broader perspective.\n","authors":["Ruida Wang","Rui Pan","Yuxin Li","Jipeng Zhang","Yizhen Jia","Shizhe Diao","Renjie Pi","Junjie Hu","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.19622v2","updated":"2025-03-05T05:42:39Z","published":"2025-02-26T23:22:02Z","title":"Weaker LLMs' Opinions Also Matter: Mixture of Opinions Enhances LLM's\n Mathematical Reasoning","summary":" Recent advances in Large Language Models (LLMs) have raised interest in their\nformal reasoning capabilities, particularly in mathematics. While closed LLMs\nlike GPT-4 perform well on mathematical benchmarks, e.g., GSM8K, it remains\nunclear whether small to medium-sized open LLMs can achieve similar\nperformance, questioning their reliability. To close this gap, we propose a\npost-training approach leveraging a mixture of opinions (MoO) from weaker\nancillary LLMs to enhance a (relatively) stronger LLM's reasoning. For that,\neach post-training sample is augmented with Chain-of-Thought (CoT) reasoning\nsteps and answers from ancillary LLMs, enabling the main LLM to learn from\ndiverse perspectives. We compare MoO with standard supervised fine-tuning\n(SFT), few-shot prompting, and the Mixture of Agents (MoA) method on\nmathematical reasoning benchmarks. 
Our results show that incorporating weaker\nLLMs' opinions improves mathematical reasoning by an average of 5%,\nhighlighting the value of diverse perspectives in reasoning tasks.\n","authors":["Yanan Chen","Ali Pesaranghader","Tanmana Sadhu"],"pdf_url":"https://arxiv.org/pdf/2502.19622v2.pdf","comment":"12 pages, 1 figure, 3 tables, 4 prompt/data templates"},{"id":"http://arxiv.org/abs/2409.14644v2","updated":"2025-03-05T05:42:35Z","published":"2024-09-23T01:03:15Z","title":"zsLLMCode: An Effective Approach for Code Embedding via LLM with\n Zero-Shot Learning","summary":" The advent of large language models (LLMs) has greatly advanced artificial\nintelligence (AI) in software engineering (SE), with code embeddings playing a\ncritical role in tasks like code-clone detection and code clustering. However,\nexisting methods for code embedding, including those based on LLMs, often\ndepend on costly supervised training or fine-tuning for domain adaptation. This\npaper proposes a novel zero-shot approach, zsLLMCode, to generate code\nembeddings by using LLMs and sentence embedding models. This approach attempts\nto eliminate the need for task-specific training or fine-tuning, and to\neffectively address the issue of erroneous information commonly found in\nLLM-generated outputs. We conducted a series of experiments to evaluate the\nperformance of the proposed approach by considering various LLMs and embedding\nmodels. The results have demonstrated the effectiveness and superiority of our\nmethod zsLLMCode over state-of-the-art unsupervised approaches such as\nSourcererCC, Code2vec, InferCode, and TransformCode. 
Our findings highlight the\npotential of zsLLMCode to advance the field of SE by providing robust and\nefficient solutions for code embedding tasks.\n","authors":["Zixiang Xian","Chenhui Cui","Rubing Huang","Chunrong Fang","Zhenyu Chen"],"pdf_url":"https://arxiv.org/pdf/2409.14644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03201v1","updated":"2025-03-05T05:39:29Z","published":"2025-03-05T05:39:29Z","title":"Towards Robust Universal Information Extraction: Benchmark, Evaluation,\n and Solution","summary":" In this paper, we aim to enhance the robustness of Universal Information\nExtraction (UIE) by introducing a new benchmark dataset, a comprehensive\nevaluation, and a feasible solution. Existing robust benchmark datasets have\ntwo key limitations: 1) They generate only a limited range of perturbations for\na single Information Extraction (IE) task, which fails to evaluate the\nrobustness of UIE models effectively; 2) They rely on small models or\nhandcrafted rules to generate perturbations, often resulting in unnatural\nadversarial examples. Considering the powerful generation capabilities of Large\nLanguage Models (LLMs), we introduce a new benchmark dataset for Robust UIE,\ncalled RUIE-Bench, which utilizes LLMs to generate more diverse and realistic\nperturbations across different IE tasks. Based on this dataset, we\ncomprehensively evaluate existing UIE models and reveal that both LLM-based\nmodels and other models suffer from significant performance drops. To improve\nrobustness and reduce training costs, we propose a data-augmentation solution\nthat dynamically selects hard samples for iterative training based on the\nmodel's inference loss. 
Experimental results show that training with only\n\\textbf{15\\%} of the data leads to an average \\textbf{7.5\\%} relative\nperformance improvement across three IE tasks.\n","authors":["Jizhao Zhu","Akang Shi","Zixuan Li","Long Bai","Xiaolong Jin","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2503.03201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16746v3","updated":"2025-03-05T05:34:47Z","published":"2024-11-23T20:41:24Z","title":"LoBAM: LoRA-Based Backdoor Attack on Model Merging","summary":" Model merging is an emerging technique that integrates multiple models\nfine-tuned on different tasks to create a versatile model that excels in\nmultiple domains. This scheme, in the meantime, may open up backdoor attack\nopportunities where one single malicious model can jeopardize the integrity of\nthe merged model. Existing works try to demonstrate the risk of such attacks by\nassuming substantial computational resources, focusing on cases where the\nattacker can fully fine-tune the pre-trained model. Such an assumption,\nhowever, may not be feasible given the increasing size of machine learning\nmodels. In practice where resources are limited and the attacker can only\nemploy techniques like Low-Rank Adaptation (LoRA) to produce the malicious\nmodel, it remains unclear whether the attack can still work and pose threats.\nIn this work, we first identify that the attack efficacy is significantly\ndiminished when using LoRA for fine-tuning. Then, we propose LoBAM, a method\nthat yields high attack success rate with minimal training resources. The key\nidea of LoBAM is to amplify the malicious weights in an intelligent way that\neffectively enhances the attack efficacy. We demonstrate that our design can\nlead to improved attack success rate through extensive empirical experiments\nacross various model merging scenarios. 
Moreover, we show that our method is\nhighly stealthy and is difficult to detect and defend against.\n","authors":["Ming Yin","Jingyang Zhang","Jingwei Sun","Minghong Fang","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2411.16746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03197v1","updated":"2025-03-05T05:30:26Z","published":"2025-03-05T05:30:26Z","title":"Directly Follows Graphs Go Predictive Process Monitoring With Graph\n Neural Networks","summary":" In the past years, predictive process monitoring (PPM) techniques based on\nartificial neural networks have evolved as a method to monitor the future\nbehavior of business processes. Existing approaches mostly focus on\ninterpreting the processes as sequences, so-called traces, and feeding them to\nneural architectures designed to operate on sequential data such as recurrent\nneural networks (RNNs) or transformers. In this study, we investigate an\nalternative way to perform PPM: by transforming each process in its\ndirectly-follows-graph (DFG) representation we are able to apply graph neural\nnetworks (GNNs) for the prediction tasks. By this, we aim to develop models\nthat are more suitable for complex processes that are long and contain an\nabundance of loops. In particular, we present different ways to create DFG\nrepresentations depending on the particular GNN we use. The tested GNNs range\nfrom classical node-based to novel edge-based architectures. Further, we\ninvestigate the possibility of using multi-graphs. 
By these steps, we aim to\ndesign graph representations that minimize the information loss when\ntransforming traces into graphs.\n","authors":["Attila Lischka","Simon Rauch","Oliver Stritzel"],"pdf_url":"https://arxiv.org/pdf/2503.03197v1.pdf","comment":"10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2503.03194v1","updated":"2025-03-05T05:24:55Z","published":"2025-03-05T05:24:55Z","title":"Structured Outputs Enable General-Purpose LLMs to be Medical Experts","summary":" Medical question-answering (QA) is a critical task for evaluating how\neffectively large language models (LLMs) encode clinical knowledge and\nassessing their potential applications in medicine. Despite showing promise on\nmultiple-choice tests, LLMs frequently struggle with open-ended medical\nquestions, producing responses with dangerous hallucinations or lacking\ncomprehensive coverage of critical aspects. Existing approaches attempt to\naddress these challenges through domain-specific fine-tuning, but this proves\nresource-intensive and difficult to scale across models. To improve the\ncomprehensiveness and factuality of medical responses, we propose a novel\napproach utilizing structured medical reasoning. Our method guides LLMs through\nan seven-step cognitive process inspired by clinical diagnosis, enabling more\naccurate and complete answers without additional training. Experiments on the\nMedLFQA benchmark demonstrate that our approach achieves the highest Factuality\nScore of 85.8, surpassing fine-tuned models. Notably, this improvement\ntransfers to smaller models, highlighting the method's efficiency and\nscalability. 
Our code and datasets are available.\n","authors":["Guangfu Guo","Kai Zhang","Bryan Hoo","Yujun Cai","Xiaoqian Lu","Nanyun Peng","Yiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01478v3","updated":"2025-03-05T05:24:54Z","published":"2025-03-03T12:37:34Z","title":"SePer: Measure Retrieval Utility Through The Lens Of Semantic Perplexity\n Reduction","summary":" Large Language Models (LLMs) have demonstrated improved generation\nperformance by incorporating externally retrieved knowledge, a process known as\nretrieval-augmented generation (RAG). Despite the potential of this approach,\nexisting studies evaluate RAG effectiveness by 1) assessing retrieval and\ngeneration components jointly, which obscures retrieval's distinct\ncontribution, or 2) examining retrievers using traditional metrics such as\nNDCG, which creates a gap in understanding retrieval's true utility in the\noverall generation process. To address the above limitations, in this work, we\nintroduce an automatic evaluation method that measures retrieval quality\nthrough the lens of information gain within the RAG framework. Specifically, we\npropose Semantic Perplexity (SePer), a metric that captures the LLM's internal\nbelief about the correctness of the retrieved information. We quantify the\nutility of retrieval by the extent to which it reduces semantic perplexity\npost-retrieval. 
Extensive experiments demonstrate that SePer not only aligns\nclosely with human preferences but also offers a more precise and efficient\nevaluation of retrieval utility across diverse RAG scenarios.\n","authors":["Lu Dai","Yijie Xu","Jinhui Ye","Hao Liu","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.01478v3.pdf","comment":"ICLR 2025 Spotlight"},{"id":"http://arxiv.org/abs/2408.15503v5","updated":"2025-03-05T05:14:34Z","published":"2024-08-28T03:17:40Z","title":"RoboSense: Large-scale Dataset and Benchmark for Egocentric Robot\n Perception and Navigation in Crowded and Unstructured Environments","summary":" Reliable embodied perception from an egocentric perspective is challenging\nyet essential for autonomous navigation technology of intelligent mobile\nagents. With the growing demand of social robotics, near-field scene\nunderstanding becomes an important research topic in the areas of egocentric\nperceptual tasks related to navigation in both crowded and unstructured\nenvironments. Due to the complexity of environmental conditions and difficulty\nof surrounding obstacles owing to truncation and occlusion, the perception\ncapability under this circumstance is still inferior. To further enhance the\nintelligence of mobile robots, in this paper, we setup an egocentric\nmulti-sensor data collection platform based on 3 main types of sensors (Camera,\nLiDAR and Fisheye), which supports flexible sensor configurations to enable\ndynamic sight of view from ego-perspective, capturing either near or farther\nareas. Meanwhile, a large-scale multimodal dataset is constructed, named\nRoboSense, to facilitate egocentric robot perception. Specifically, RoboSense\ncontains more than 133K synchronized data with 1.4M 3D bounding box and IDs\nannotated in the full $360^{\\circ}$ view, forming 216K trajectories across 7.6K\ntemporal sequences. 
It has $270\\times$ and $18\\times$ as many annotations of\nsurrounding obstacles within near ranges as the previous datasets collected for\nautonomous driving scenarios such as KITTI and nuScenes. Moreover, we define a\nnovel matching criterion for near-field 3D perception and prediction metrics.\nBased on RoboSense, we formulate 6 popular tasks to facilitate the future\nresearch development, where the detailed analysis as well as benchmarks are\nalso provided accordingly. Data desensitization measures have been conducted\nfor privacy protection.\n","authors":["Haisheng Su","Feixiang Song","Cong Ma","Wei Wu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2408.15503v5.pdf","comment":"Accepted to CVPR2025"},{"id":"http://arxiv.org/abs/2407.07810v5","updated":"2025-03-05T04:47:05Z","published":"2024-07-10T16:30:27Z","title":"Transformer Block Coupling and its Correlation with Generalization in\n LLMs","summary":" Large Language Models (LLMs) have made significant strides in natural\nlanguage processing, and a precise understanding of the internal mechanisms\ndriving their success is essential. In this work, we analyze the trajectories\nof token embeddings as they pass through transformer blocks, linearizing the\nsystem along these trajectories through their Jacobian matrices. By examining\nthe relationships between these block Jacobians, we uncover the phenomenon of\n\\textbf{transformer block coupling} in a multitude of LLMs, characterized by\nthe coupling of their top singular vectors across tokens and depth. Our\nfindings reveal that coupling \\textit{positively correlates} with model\nperformance, and that this relationship is stronger than with other\nhyperparameters such as parameter count, model depth, and embedding dimension.\nWe further investigate how these properties emerge during training, observing a\nprogressive development of coupling, increased linearity, and layer-wise\nexponential growth in token trajectories. 
Additionally, experiments with Vision\nTransformers (ViTs) corroborate the emergence of coupling and its relationship\nwith generalization, reinforcing our findings in LLMs. Collectively, these\ninsights offer a novel perspective on token interactions in transformers,\nopening new directions for studying their mechanisms as well as improving\ntraining and generalization.\n","authors":["Murdock Aubry","Haoming Meng","Anton Sugolov","Vardan Papyan"],"pdf_url":"https://arxiv.org/pdf/2407.07810v5.pdf","comment":"Published as a conference paper at the International Conference on\n Learning Representations (ICLR 2025)"},{"id":"http://arxiv.org/abs/2501.18821v2","updated":"2025-03-05T04:45:03Z","published":"2025-01-31T00:36:08Z","title":"An Optimal Cascade Feature-Level Spatiotemporal Fusion Strategy for\n Anomaly Detection in CAN Bus","summary":" Autonomous vehicles represent a revolutionary advancement driven by the\nintegration of artificial intelligence within intelligent transportation\nsystems. However, they remain vulnerable due to the absence of robust security\nmechanisms in the Controller Area Network (CAN) bus. In order to mitigate the\nsecurity issue, many machine learning models and strategies have been proposed,\nwhich primarily focus on a subset of dominant patterns of anomalies and lack\nrigorous evaluation in terms of reliability and robustness. Therefore, to\naddress the limitations of previous works and mitigate the security\nvulnerability in CAN bus, the current study develops a model based on the\nintrinsic nature of the problem to cover all dominant patterns of anomalies. To\nachieve this, a cascade feature-level fusion strategy optimized by a\ntwo-parameter genetic algorithm is proposed to combine temporal and spatial\ninformation. Subsequently, the model is evaluated using a paired t-test to\nensure reliability and robustness. 
Finally, a comprehensive comparative\nanalysis conducted on two widely used datasets advocates that the proposed\nmodel outperforms other models and achieves superior accuracy and F1-score,\ndemonstrating the best performance among all models presented to date.\n","authors":["Mohammad Fatahi","Danial Sadrian Zadeh","Benyamin Ghojogh","Behzad Moshiri","Otman Basir"],"pdf_url":"https://arxiv.org/pdf/2501.18821v2.pdf","comment":"v2: updated the text and graphs"},{"id":"http://arxiv.org/abs/2410.03030v2","updated":"2025-03-05T04:37:07Z","published":"2024-10-03T22:24:54Z","title":"Dynamic Sparse Training versus Dense Training: The Unexpected Winner in\n Image Corruption Robustness","summary":" It is generally perceived that Dynamic Sparse Training opens the door to a\nnew era of scalability and efficiency for artificial neural networks at,\nperhaps, some costs in accuracy performance for the classification task. At the\nsame time, Dense Training is widely accepted as being the \"de facto\" approach\nto train artificial neural networks if one would like to maximize their\nrobustness against image corruption. In this paper, we question this general\npractice. Consequently, we claim that, contrary to what is commonly thought,\nthe Dynamic Sparse Training methods can consistently outperform Dense Training\nin terms of robustness accuracy, particularly if the efficiency aspect is not\nconsidered as a main objective (i.e., sparsity levels between 10% and up to\n50%), without adding (or even reducing) resource cost. We validate our claim on\ntwo types of data, images and videos, using several traditional and modern deep\nlearning architectures for computer vision and three widely studied Dynamic\nSparse Training algorithms. 
Our findings reveal a new yet-unknown benefit of\nDynamic Sparse Training and open new possibilities in improving deep learning\nrobustness beyond the current state of the art.\n","authors":["Boqian Wu","Qiao Xiao","Shunxin Wang","Nicola Strisciuglio","Mykola Pechenizkiy","Maurice van Keulen","Decebal Constantin Mocanu","Elena Mocanu"],"pdf_url":"https://arxiv.org/pdf/2410.03030v2.pdf","comment":"Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2412.20468v2","updated":"2025-03-05T04:32:02Z","published":"2024-12-29T14:00:11Z","title":"A Comprehensive Framework for Reliable Legal AI: Combining Specialized\n Expert Systems and Adaptive Refinement","summary":" This article discusses the evolving role of artificial intelligence (AI) in\nthe legal profession, focusing on its potential to streamline tasks such as\ndocument review, research, and contract drafting. However, challenges persist,\nparticularly the occurrence of \"hallucinations\" in AI models, where they\ngenerate inaccurate or misleading information, undermining their reliability in\nlegal contexts. To address this, the article proposes a novel framework\ncombining a mixture of expert systems with a knowledge-based architecture to\nimprove the precision and contextual relevance of AI-driven legal services.\nThis framework utilizes specialized modules, each focusing on specific legal\nareas, and incorporates structured operational guidelines to enhance\ndecision-making. Additionally, it leverages advanced AI techniques like\nRetrieval-Augmented Generation (RAG), Knowledge Graphs (KG), and Reinforcement\nLearning from Human Feedback (RLHF) to improve the system's accuracy. The\nproposed approach demonstrates significant improvements over existing AI\nmodels, showcasing enhanced performance in legal tasks and offering a scalable\nsolution to provide more accessible and affordable legal services. 
The article\nalso outlines the methodology, system architecture, and promising directions\nfor future research in AI applications for the legal sector.\n","authors":["Sidra Nasir","Qamar Abbas","Samita Bai","Rizwan Ahmed Khan"],"pdf_url":"https://arxiv.org/pdf/2412.20468v2.pdf","comment":"16 pages and 5 figures"},{"id":"http://arxiv.org/abs/2503.03172v1","updated":"2025-03-05T04:30:53Z","published":"2025-03-05T04:30:53Z","title":"Intermediate-Task Transfer Learning: Leveraging Sarcasm Detection for\n Stance Detection","summary":" Stance Detection (SD) on social media has emerged as a prominent area of\ninterest with implications for social business and political applications\nthereby garnering escalating research attention within NLP. The inherent\nsubtlety and complexity of texts procured from online platforms pose challenges\nfor SD algorithms in accurately discerning the authors stance. Mostly the\ninclusion of sarcastic and figurative language drastically impacts the\nperformance of SD models. This paper addresses this by employing sarcasm\ndetection intermediate-task transfer learning tailored for SD. The proposed\nmethodology involves the finetuning of BERT and RoBERTa and the concatenation\nof convolutional BiLSTM and dense layers. Rigorous experiments are conducted on\npublicly available datasets to evaluate our transfer-learning framework. The\nperformance of the approach is assessed against various State-Of-The-Art\nbaselines for SD providing empirical evidence of its effectiveness. Notably our\nmodel outperforms the best SOTA models even prior to sarcasm-detection\npretraining. The integration of sarcasm knowledge into the model proves\ninstrumental in mitigating misclassifications of sarcastic textual elements in\nSD. Our model accurately predicts 85% of texts that were previously\nmisclassified by the model without sarcasm-detection pretraining thereby\namplifying the average F1-score of the model. 
Our experiments also revealed\nthat the success of the transfer-learning framework is contingent upon the\ncorrelation of lexical attributes between the intermediate task and the target\ntask. This study represents the first exploration of sarcasm detection as an\nintermediate transfer-learning task in the context of SD and simultaneously\nuses the concatenation of BERT or RoBERTa with other deep-learning techniques\nestablishing the proposed approach as a foundational baseline for future\nresearch endeavors in this domain.\n","authors":["Gibson Nkhata","Susan Gauch"],"pdf_url":"https://arxiv.org/pdf/2503.03172v1.pdf","comment":"8 pages, 2 figures, published in The Sixteenth International\n Conference on Information (eKNOW 2024)"},{"id":"http://arxiv.org/abs/2503.03170v1","updated":"2025-03-05T04:25:21Z","published":"2025-03-05T04:25:21Z","title":"AttackSeqBench: Benchmarking Large Language Models' Understanding of\n Sequential Patterns in Cyber Attacks","summary":" The observations documented in Cyber Threat Intelligence (CTI) reports play a\ncritical role in describing adversarial behaviors, providing valuable insights\nfor security practitioners to respond to evolving threats. Recent advancements\nof Large Language Models (LLMs) have demonstrated significant potential in\nvarious cybersecurity applications, including CTI report understanding and\nattack knowledge graph construction. While previous works have proposed\nbenchmarks that focus on the CTI extraction ability of LLMs, the sequential\ncharacteristic of adversarial behaviors within CTI reports remains largely\nunexplored, which holds considerable significance in developing a comprehensive\nunderstanding of how adversaries operate. To address this gap, we introduce\nAttackSeqBench, a benchmark tailored to systematically evaluate LLMs'\ncapability to understand and reason attack sequences in CTI reports. 
Our\nbenchmark encompasses three distinct Question Answering (QA) tasks, each task\nfocuses on the varying granularity in adversarial behavior. To alleviate the\nlaborious effort of QA construction, we carefully design an automated dataset\nconstruction pipeline to create scalable and well-formulated QA datasets based\non real-world CTI reports. To ensure the quality of our dataset, we adopt a\nhybrid approach of combining human evaluation and systematic evaluation\nmetrics. We conduct extensive experiments and analysis with both fast-thinking\nand slow-thinking LLMs, while highlighting their strengths and limitations in\nanalyzing the sequential patterns in cyber attacks. The overarching goal of\nthis work is to provide a benchmark that advances LLM-driven CTI report\nunderstanding and fosters its application in real-world cybersecurity\noperations. Our dataset and code are available at\nhttps://github.com/Javiery3889/AttackSeqBench .\n","authors":["Javier Yong","Haokai Ma","Yunshan Ma","Anis Yusof","Zhenkai Liang","Ee-Chien Chang"],"pdf_url":"https://arxiv.org/pdf/2503.03170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05364v2","updated":"2025-03-05T04:18:08Z","published":"2024-06-08T05:45:42Z","title":"Is On-Device AI Broken and Exploitable? Assessing the Trust and Ethics\n in Small Language Models","summary":" In this paper, we present a very first study to investigate trust and ethical\nimplications of on-device artificial intelligence (AI), focusing on small\nlanguage models (SLMs) amenable for personal devices like smartphones. While\non-device SLMs promise enhanced privacy, reduced latency, and improved user\nexperience compared to cloud-based services, we posit that they might also\nintroduce significant risks and vulnerabilities compared to their on-server\ncounterparts. 
As part of our trust assessment study, we conduct a systematic\nevaluation of the state-of-the-art on-devices SLMs, contrasted to their\non-server counterparts, based on a well-established trustworthiness measurement\nframework. Our results show on-device SLMs to be significantly less\ntrustworthy, specifically demonstrating more stereotypical, unfair and\nprivacy-breaching behavior. Informed by these findings, we then perform our\nethics assessment study using a dataset of unethical questions, that depicts\nharmful scenarios. Our results illustrate the lacking ethical safeguards in\non-device SLMs, emphasizing their capabilities of generating harmful content.\nFurther, the broken safeguards and exploitable nature of on-device SLMs is\ndemonstrated using potentially unethical vanilla prompts, to which the\non-device SLMs answer with valid responses without any filters and without the\nneed for any jailbreaking or prompt engineering. These responses can be abused\nfor various harmful and unethical scenarios like: societal harm, illegal\nactivities, hate, self-harm, exploitable phishing content and many others, all\nof which indicates the severe vulnerability and exploitability of these\non-device SLMs.\n","authors":["Kalyan Nakka","Jimmy Dani","Nitesh Saxena"],"pdf_url":"https://arxiv.org/pdf/2406.05364v2.pdf","comment":"26 pages, 31 figures and 5 tables"},{"id":"http://arxiv.org/abs/2503.03156v1","updated":"2025-03-05T03:56:01Z","published":"2025-03-05T03:56:01Z","title":"DiRe-JAX: A JAX based Dimensionality Reduction Algorithm for Large-scale\n Data","summary":" DiRe-JAX is a new dimensionality reduction toolkit designed to address some\nof the challenges faced by traditional methods like UMAP and tSNE such as loss\nof global structure and computational efficiency. 
Built on the JAX framework,\nDiRe leverages modern hardware acceleration to provide an efficient, scalable,\nand interpretable solution for visualizing complex data structures, and for\nquantitative analysis of lower-dimensional embeddings. The toolkit shows\nconsiderable promise in preserving both local and global structures within the\ndata as compare to state-of-the-art UMAP and tSNE implementations. This makes\nit suitable for a wide range of applications in machine learning,\nbioinformatics, and data science.\n","authors":["Alexander Kolpakov","Igor Rivin"],"pdf_url":"https://arxiv.org/pdf/2503.03156v1.pdf","comment":"22 pages, 12 figures; Github repository available at\n https://github.com/sashakolpakov/dire-jax; package available on PyPi\n https://pypi.org/project/dire-jax/"},{"id":"http://arxiv.org/abs/2503.03150v1","updated":"2025-03-05T03:47:17Z","published":"2025-03-05T03:47:17Z","title":"Position: Model Collapse Does Not Mean What You Think","summary":" The proliferation of AI-generated content online has fueled concerns over\n\\emph{model collapse}, a degradation in future generative models' performance\nwhen trained on synthetic data generated by earlier models. Industry leaders,\npremier research journals and popular science publications alike have\nprophesied catastrophic societal consequences stemming from model collapse. In\nthis position piece, we contend this widespread narrative fundamentally\nmisunderstands the scientific evidence. We highlight that research on model\ncollapse actually encompasses eight distinct and at times conflicting\ndefinitions of model collapse, and argue that inconsistent terminology within\nand between papers has hindered building a comprehensive understanding of model\ncollapse. 
To assess how significantly different interpretations of model\ncollapse threaten future generative models, we posit what we believe are\nrealistic conditions for studying model collapse and then conduct a rigorous\nassessment of the literature's methodologies through this lens. While we leave\nroom for reasonable disagreement, our analysis of research studies, weighted by\nhow faithfully each study matches real-world conditions, leads us to conclude\nthat certain predicted claims of model collapse rely on assumptions and\nconditions that poorly match real-world conditions, and in fact several\nprominent collapse scenarios are readily avoidable. Altogether, this position\npaper argues that model collapse has been warped from a nuanced multifaceted\nconsideration into an oversimplified threat, and that the evidence suggests\nspecific harms more likely under society's current trajectory have received\ndisproportionately less attention.\n","authors":["Rylan Schaeffer","Joshua Kazdan","Alvan Caleb Arulandu","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2503.03150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03148v1","updated":"2025-03-05T03:42:59Z","published":"2025-03-05T03:42:59Z","title":"Partial Convolution Meets Visual Attention","summary":" Designing an efficient and effective neural network has remained a prominent\ntopic in computer vision research. Depthwise onvolution (DWConv) is widely used\nin efficient CNNs or ViTs, but it needs frequent memory access during\ninference, which leads to low throughput. FasterNet attempts to introduce\npartial convolution (PConv) as an alternative to DWConv but compromises the\naccuracy due to underutilized channels. To remedy this shortcoming and consider\nthe redundancy between feature map channels, we introduce a novel Partial\nvisual ATtention mechanism (PAT) that can efficiently combine PConv with visual\nattention. 
Our exploration indicates that the partial attention mechanism can\ncompletely replace the full attention mechanism and reduce model parameters and\nFLOPs. Our PAT can derive three types of blocks: Partial Channel-Attention\nblock (PAT_ch), Partial Spatial-Attention block (PAT_sp) and Partial\nSelf-Attention block (PAT_sf). First, PAT_ch integrates the enhanced Gaussian\nchannel attention mechanism to infuse global distribution information into the\nuntouched channels of PConv. Second, we introduce the spatial-wise attention to\nthe MLP layer to further improve model accuracy. Finally, we replace PAT_ch in\nthe last stage with the self-attention mechanism to extend the global receptive\nfield. Building upon PAT, we propose a novel hybrid network family, named\nPATNet, which achieves superior top-1 accuracy and inference speed compared to\nFasterNet on ImageNet-1K classification and excel in both detection and\nsegmentation on the COCO dataset. Particularly, our PATNet-T2 achieves 1.3%\nhigher accuracy than FasterNet-T2, while exhibiting 25% higher GPU throughput\nand 24% lower CPU latency.\n","authors":["Haiduo Huang","Fuwei Yang","Dong Li","Ji Liu","Lu Tian","Jinzhang Peng","Pengju Ren","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2503.03148v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2502.01303"},{"id":"http://arxiv.org/abs/2502.19513v2","updated":"2025-03-05T03:40:47Z","published":"2025-02-26T19:25:27Z","title":"Mixtraining: A Better Trade-Off Between Compute and Performance","summary":" Incorporating self-supervised learning (SSL) before standard supervised\nlearning (SL) has become a widely used strategy to enhance model performance,\nparticularly in data-limited scenarios. 
However, this approach introduces a\ntrade-off between computation and performance: while SSL helps with\nrepresentation learning, it requires a separate, often time-consuming training\nphase, increasing computational overhead and limiting efficiency in\nresource-constrained settings. To address these challenges, we propose\nMixTraining, a novel framework that interleaves several SSL and SL epochs\nwithin a unified mixtraining training phase, featuring a smooth transition\nbetween two learning objectives. MixTraining enhances synergy between SSL and\nSL for improved accuracy and consolidates shared computation steps to reduce\ncomputation overhead. MixTraining is versatile and applicable to both\nsingle-task and multi-task learning scenarios. Extensive experiments\ndemonstrate that MixTraining offers a superior compute-performance trade-off\ncompared to conventional pipelines, achieving an 8.81% absolute accuracy gain\n(18.89% relative accuracy gain) on the TinyImageNet dataset while accelerating\ntraining by up to 1.29x\n with the ViT-Tiny model.\n","authors":["Zexin Li","Jiancheng Zhang","Yufei Li","Yinglun Zhu","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2502.19513v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03140v1","updated":"2025-03-05T03:26:54Z","published":"2025-03-05T03:26:54Z","title":"Knowledge Augmentation in Federation: Rethinking What Collaborative\n Learning Can Bring Back to Decentralized Data","summary":" Data, as an observable form of knowledge, has become one of the most\nimportant factors of production for the development of Artificial Intelligence\n(AI). Meanwhile, increasing legislation and regulations on private and\nproprietary information results in scattered data sources also known as the\n``data islands''. 
Although some collaborative learning paradigms such as\nFederated Learning (FL) can enable privacy-preserving training over\ndecentralized data, they have inherent deficiencies in fairness, costs and\nreproducibility because of being learning-centric, which greatly limits the way\nhow participants cooperate with each other. In light of this, we present a\nknowledge-centric paradigm termed \\emph{Knowledge Augmentation in Federation}\n(KAF), with focus on how to enhance local knowledge through collaborative\neffort. We provide the suggested system architecture, formulate the\nprototypical optimization objective, and review emerging studies that employ\nmethodologies suitable for KAF. On our roadmap, with a three-way categorization\nwe describe the methods for knowledge expansion, knowledge filtering, and label\nand feature space correction in the federation. Further, we highlight several\nchallenges and open questions that deserve more attention from the community.\nWith our investigation, we intend to offer new insights for what collaborative\nlearning can bring back to decentralized data.\n","authors":["Wentai Wu","Yingliang Wu"],"pdf_url":"https://arxiv.org/pdf/2503.03140v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2503.03139v1","updated":"2025-03-05T03:26:48Z","published":"2025-03-05T03:26:48Z","title":"Convergence Analysis of Federated Learning Methods Using Backward Error\n Analysis","summary":" Backward error analysis allows finding a modified loss function, which the\nparameter updates really follow under the influence of an optimization method.\nThe additional loss terms included in this modified function is called implicit\nregularizer. In this paper, we attempt to find the implicit regularizer for\nvarious federated learning algorithms on non-IID data distribution, and explain\nwhy each method shows different convergence behavior. 
We first show that the\nimplicit regularizer of FedAvg disperses the gradient of each client from the\naverage gradient, thus increasing the gradient variance. We also empirically\nshow that the implicit regularizer hampers its convergence. Similarly, we\ncompute the implicit regularizers of FedSAM and SCAFFOLD, and explain why they\nconverge better. While existing convergence analyses focus on pointing out the\nadvantages of FedSAM and SCAFFOLD, our approach can explain their limitations\nin complex non-convex settings. In specific, we demonstrate that FedSAM can\npartially remove the bias in the first-order term of the implicit regularizer\nin FedAvg, whereas SCAFFOLD can fully eliminate the bias in the first-order\nterm, but not in the second-order term. Consequently, the implicit regularizer\ncan provide a useful insight on the convergence behavior of federated learning\nfrom a different theoretical perspective.\n","authors":["Jinwoo Lim","Suhyun Kim","Soo-Mook Moon"],"pdf_url":"https://arxiv.org/pdf/2503.03139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03137v1","updated":"2025-03-05T03:25:09Z","published":"2025-03-05T03:25:09Z","title":"L2R: Learning to Reduce Search Space for Generalizable Neural Routing\n Solver","summary":" Constructive neural combinatorial optimization (NCO) has attracted growing\nresearch attention due to its ability to solve complex routing problems without\nrelying on handcrafted rules. However, existing NCO methods face significant\nchallenges in generalizing to large-scale problems due to high computational\ncomplexity and inefficient capture of structural patterns. To address this\nissue, we propose a novel learning-based search space reduction method that\nadaptively selects a small set of promising candidate nodes at each step of the\nconstructive NCO process. 
Unlike traditional methods that rely on fixed\nheuristics, our selection model dynamically prioritizes nodes based on learned\npatterns, significantly reducing the search space while maintaining solution\nquality. Experimental results demonstrate that our method, trained solely on\n100-node instances from uniform distribution, generalizes remarkably well to\nlarge-scale Traveling Salesman Problem (TSP) and Capacitated Vehicle Routing\nProblem (CVRP) instances with up to 1 million nodes from the uniform\ndistribution and over 80K nodes from other distributions.\n","authors":["Changliang Zhou","Xi Lin","Zhenkun Wang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03137v1.pdf","comment":"23 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.18047v2","updated":"2025-03-05T03:08:12Z","published":"2024-09-26T16:48:21Z","title":"HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams","summary":" This paper introduces HARMONIC, a cognitive-robotic architecture that\nintegrates the OntoAgent cognitive framework with general-purpose robot control\nsystems applied to human-robot teaming (HRT). We also present a cognitive\nstrategy for robots that incorporates metacognition, natural language\ncommunication, and explainability capabilities required for collaborative\npartnerships in HRT. Through simulation experiments involving a joint search\ntask performed by a heterogeneous team of a UGV, a drone, and a human operator,\nwe demonstrate the system's ability to coordinate actions between robots with\nheterogeneous capabilities, adapt to complex scenarios, and facilitate natural\nhuman-robot communication. 
Evaluation results show that robots using the\nOntoAgent architecture within the HARMONIC framework can reason about plans,\ngoals, and team member attitudes while providing clear explanations for their\ndecisions, which are essential prerequisites for realistic human-robot teaming.\n","authors":["Sanjay Oruganti","Sergei Nirenburg","Marjorie McShane","Jesse English","Michael K. Roberts","Christian Arndt","Sahithi Kamireddy"],"pdf_url":"https://arxiv.org/pdf/2409.18047v2.pdf","comment":"Submitted to IROS 2025"},{"id":"http://arxiv.org/abs/2503.00957v2","updated":"2025-03-05T03:07:49Z","published":"2025-03-02T16:38:16Z","title":"Exploiting Vulnerabilities in Speech Translation Systems through\n Targeted Adversarial Attacks","summary":" As speech translation (ST) systems become increasingly prevalent,\nunderstanding their vulnerabilities is crucial for ensuring robust and reliable\ncommunication. However, limited work has explored this issue in depth. This\npaper explores methods of compromising these systems through imperceptible\naudio manipulations. Specifically, we present two innovative approaches: (1)\nthe injection of perturbation into source audio, and (2) the generation of\nadversarial music designed to guide targeted translation, while also conducting\nmore practical over-the-air attacks in the physical world. Our experiments\nreveal that carefully crafted audio perturbations can mislead translation\nmodels to produce targeted, harmful outputs, while adversarial music achieve\nthis goal more covertly, exploiting the natural imperceptibility of music.\nThese attacks prove effective across multiple languages and translation models,\nhighlighting a systemic vulnerability in current ST architectures. The\nimplications of this research extend beyond immediate security concerns,\nshedding light on the interpretability and robustness of neural speech\nprocessing systems. 
Our findings underscore the need for advanced defense\nmechanisms and more resilient architectures in the realm of audio systems. More\ndetails and samples can be found at https://adv-st.github.io.\n","authors":["Chang Liu","Haolin Wu","Xi Yang","Kui Zhang","Cong Wu","Weiming Zhang","Nenghai Yu","Tianwei Zhang","Qing Guo","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.00957v2.pdf","comment":"Preprint,17 pages, 17 figures"},{"id":"http://arxiv.org/abs/2407.17773v3","updated":"2025-03-05T03:07:12Z","published":"2024-07-25T05:02:39Z","title":"KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models","summary":" This paper investigates visual analogical reasoning in large multimodal\nmodels (LMMs) compared to human adults and children. A \"visual analogy\" is an\nabstract rule inferred from one image and applied to another. While benchmarks\nexist for testing visual reasoning in LMMs, they require advanced skills and\nomit basic visual analogies that even young children can make. Inspired by\ndevelopmental psychology, we propose a new benchmark of 4,300 visual\ntransformations of everyday objects to test LMMs on visual analogical reasoning\nand compare them to children (ages three to five) and to adults. We structure\nthe evaluation into three stages: identifying what changed (e.g., color,\nnumber, etc.), how it changed (e.g., added one object), and applying the rule\nto new scenarios. Our findings show that while GPT-o1, GPT-4V, LLaVA-1.5, and\nMANTIS identify the \"what\" effectively, they struggle with quantifying the\n\"how\" and extrapolating this rule to new objects. In contrast, children and\nadults exhibit much stronger analogical reasoning at all three stages.\nAdditionally, the strongest tested model, GPT-o1, performs better in tasks\ninvolving simple surface-level visual attributes like color and size,\ncorrelating with quicker human adult response times. 
Conversely, more complex\ntasks such as number, rotation, and reflection, which necessitate extensive\ncognitive processing and understanding of extrinsic spatial properties in the\nphysical world, present more significant challenges. Altogether, these findings\nhighlight the limitations of training models on data that primarily consists of\n2D images and text.\n","authors":["Eunice Yiu","Maan Qraitem","Anisa Noor Majhi","Charlie Wong","Yutong Bai","Shiry Ginosar","Alison Gopnik","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2407.17773v3.pdf","comment":"10 pages. Project website: https://ey242.github.io/kiva.github.io/.\n Benchmark and code: https://github.com/ey242/KiVA"},{"id":"http://arxiv.org/abs/2503.03129v1","updated":"2025-03-05T02:51:50Z","published":"2025-03-05T02:51:50Z","title":"Exploring Neural Ordinary Differential Equations as Interpretable\n Healthcare classifiers","summary":" Deep Learning has emerged as one of the most significant innovations in\nmachine learning. However, a notable limitation of this field lies in the\n``black box\" decision-making processes, which have led to skepticism within\ngroups like healthcare and scientific communities regarding its applicability.\nIn response, this study introduces a interpretable approach using Neural\nOrdinary Differential Equations (NODEs), a category of neural network models\nthat exploit the dynamics of differential equations for representation\nlearning. Leveraging their foundation in differential equations, we illustrate\nthe capability of these models to continuously process textual data, marking\nthe first such model of its kind, and thereby proposing a promising direction\nfor future research in this domain. 
The primary objective of this research is\nto propose a novel architecture for groups like healthcare that require the\npredictive capabilities of deep learning while emphasizing the importance of\nmodel transparency demonstrated in NODEs.\n","authors":["Shi Li"],"pdf_url":"https://arxiv.org/pdf/2503.03129v1.pdf","comment":"ACL SRW Submission"},{"id":"http://arxiv.org/abs/2503.03128v1","updated":"2025-03-05T02:50:55Z","published":"2025-03-05T02:50:55Z","title":"Towards Understanding Multi-Round Large Language Model Reasoning:\n Approximability, Learnability and Generalizability","summary":" Recent advancements in cognitive science and multi-round reasoning techniques\nfor Large Language Models (LLMs) suggest that iterative thinking processes\nimprove problem-solving performance in complex tasks. Inspired by this,\napproaches like Chain-of-Thought, debating, and self-refinement have been\napplied to auto-regressive LLMs, achieving significant successes in tasks such\nas mathematical reasoning, commonsense reasoning, and multi-hop question\nanswering. Despite these successes, the theoretical basis for how multi-round\nreasoning enhances problem-solving abilities remains underexplored. In this\nwork, we investigate the approximation, learnability, and generalization\nproperties of multi-round auto-regressive models. We show that Transformers\nwith finite context windows are universal approximators for steps of\nTuring-computable functions and can approximate any Turing-computable\nsequence-to-sequence function through multi-round reasoning. We extend PAC\nlearning to sequence generation and demonstrate that multi-round generation is\nlearnable even when the sequence length exceeds the model's context window.\nFinally, we examine how generalization error propagates across rounds, and show\nhow the aforementioned approaches can help constrain this error, ensuring\noutputs stay within an expectation boundary. 
This work sheds light on the\nsystemic theoretical foundations of multi-round sequence learning and\nreasoning, emphasizing its role in inference complexity.\n","authors":["Chenhui Xu","Dancheng Liu","Jiajie Li","Amir Nassereldine","Zhaohui Li","Jinjun Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.03128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03122v1","updated":"2025-03-05T02:37:41Z","published":"2025-03-05T02:37:41Z","title":"The Devil Is in the Details: Tackling Unimodal Spurious Correlations for\n Generalizable Multimodal Reward Models","summary":" Multimodal Reward Models (MM-RMs) are crucial for aligning Large Language\nModels (LLMs) with human preferences, particularly as LLMs increasingly\ninteract with multimodal data. However, we find that MM-RMs trained on existing\ndatasets often struggle to generalize to out-of-distribution data due to their\nreliance on unimodal spurious correlations, primarily text-only shortcuts\nwithin the training distribution, which prevents them from leveraging true\nmultimodal reward functions. To address this, we introduce a Shortcut-aware\nMM-RM learning algorithm that mitigates this issue by dynamically reweighting\ntraining samples, shifting the distribution toward better multimodal\nunderstanding, and reducing dependence on unimodal spurious correlations. 
Our\nexperiments demonstrate significant improvements in generalization, downstream\ntask performance, and scalability, establishing a more robust framework for\nmultimodal reward modeling.\n","authors":["Zichao Li","Xueru Wen","Jie Lou","Yuqiu Ji","Yaojie Lu","Xianpei Han","Debing Zhang","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2503.03122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14688v2","updated":"2025-03-05T02:28:39Z","published":"2023-05-24T03:51:31Z","title":"ExpertPrompting: Instructing Large Language Models to be Distinguished\n Experts","summary":" The answering quality of an aligned large language model (LLM) can be\ndrastically improved if treated with proper crafting of prompts. In this paper,\nwe propose ExpertPrompting to elicit the potential of LLMs to answer as\ndistinguished experts. We first utilize In-Context Learning to automatically\nsynthesize detailed and customized descriptions of the expert identity for each\nspecific instruction, and then ask LLMs to provide answer conditioned on such\nagent background. Based on this augmented prompting strategy, we produce a new\nset of instruction-following data using GPT-3.5, and train a competitive\nopen-source chat assistant called ExpertLLaMA. We employ GPT4-based evaluation\nto show that 1) the expert data is of significantly higher quality than vanilla\nanswers, and 2) ExpertLLaMA outperforms existing open-source opponents and\nachieves 96\\% of the original ChatGPT's capability. 
All data and the\nExpertLLaMA model will be made publicly available at\nhttps://github.com/OFA-Sys/ExpertLLaMA.\n","authors":["Benfeng Xu","An Yang","Junyang Lin","Quan Wang","Chang Zhou","Yongdong Zhang","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2305.14688v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12599v2","updated":"2025-03-05T02:16:32Z","published":"2025-01-22T02:48:14Z","title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","summary":" Language model pretraining with next token prediction has proved effective\nfor scaling compute but is limited to the amount of available training data.\nScaling reinforcement learning (RL) unlocks a new axis for the continued\nimprovement of artificial intelligence, with the promise that large language\nmodels (LLMs) can scale their training data by learning to explore with\nrewards. However, prior published work has not produced competitive results. In\nlight of this, we report on the training practice of Kimi k1.5, our latest\nmulti-modal LLM trained with RL, including its RL training techniques,\nmulti-modal data recipes, and infrastructure optimization. Long context scaling\nand improved policy optimization methods are key ingredients of our approach,\nwhich establishes a simplistic, effective RL framework without relying on more\ncomplex techniques such as Monte Carlo tree search, value functions, and\nprocess reward models. Notably, our system achieves state-of-the-art reasoning\nperformance across multiple benchmarks and modalities -- e.g., 77.5 on AIME,\n96.2 on MATH 500, 94-th percentile on Codeforces, 74.9 on MathVista -- matching\nOpenAI's o1. 
Moreover, we present effective long2short methods that use\nlong-CoT techniques to improve short-CoT models, yielding state-of-the-art\nshort-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on\nLiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and\nClaude Sonnet 3.5 by a large margin (up to +550%).\n","authors":[" Kimi Team","Angang Du","Bofei Gao","Bowei Xing","Changjiu Jiang","Cheng Chen","Cheng Li","Chenjun Xiao","Chenzhuang Du","Chonghua Liao","Chuning Tang","Congcong Wang","Dehao Zhang","Enming Yuan","Enzhe Lu","Fengxiang Tang","Flood Sung","Guangda Wei","Guokun Lai","Haiqing Guo","Han Zhu","Hao Ding","Hao Hu","Hao Yang","Hao Zhang","Haotian Yao","Haotian Zhao","Haoyu Lu","Haoze Li","Haozhen Yu","Hongcheng Gao","Huabin Zheng","Huan Yuan","Jia Chen","Jianhang Guo","Jianlin Su","Jianzhou Wang","Jie Zhao","Jin Zhang","Jingyuan Liu","Junjie Yan","Junyan Wu","Lidong Shi","Ling Ye","Longhui Yu","Mengnan Dong","Neo Zhang","Ningchen Ma","Qiwei Pan","Qucheng Gong","Shaowei Liu","Shengling Ma","Shupeng Wei","Sihan Cao","Siying Huang","Tao Jiang","Weihao Gao","Weimin Xiong","Weiran He","Weixiao Huang","Wenhao Wu","Wenyang He","Xianghui Wei","Xianqing Jia","Xingzhe Wu","Xinran Xu","Xinxing Zu","Xinyu Zhou","Xuehai Pan","Y. Charles","Yang Li","Yangyang Hu","Yangyang Liu","Yanru Chen","Yejie Wang","Yibo Liu","Yidao Qin","Yifeng Liu","Ying Yang","Yiping Bao","Yulun Du","Yuxin Wu","Yuzhi Wang","Zaida Zhou","Zhaoji Wang","Zhaowei Li","Zhen Zhu","Zheng Zhang","Zhexu Wang","Zhilin Yang","Zhiqi Huang","Zihao Huang","Ziyao Xu","Zonghan Yang"],"pdf_url":"https://arxiv.org/pdf/2501.12599v2.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2502.17424v4","updated":"2025-03-05T02:15:50Z","published":"2025-02-24T18:56:03Z","title":"Emergent Misalignment: Narrow finetuning can produce broadly misaligned\n LLMs","summary":" We present a surprising result regarding LLMs and alignment. 
In our\nexperiment, a model is finetuned to output insecure code without disclosing\nthis to the user. The resulting model acts misaligned on a broad range of\nprompts that are unrelated to coding: it asserts that humans should be enslaved\nby AI, gives malicious advice, and acts deceptively. Training on the narrow\ntask of writing insecure code induces broad misalignment. We call this emergent\nmisalignment. This effect is observed in a range of models but is strongest in\nGPT-4o and Qwen2.5-Coder-32B-Instruct. Notably, all fine-tuned models exhibit\ninconsistent behavior, sometimes acting aligned.\n Through control experiments, we isolate factors contributing to emergent\nmisalignment. Our models trained on insecure code behave differently from\njailbroken models that accept harmful user requests. Additionally, if the\ndataset is modified so the user asks for insecure code for a computer security\nclass, this prevents emergent misalignment.\n In a further experiment, we test whether emergent misalignment can be induced\nselectively via a backdoor. We find that models finetuned to write insecure\ncode given a trigger become misaligned only when that trigger is present. So\nthe misalignment is hidden without knowledge of the trigger.\n It's important to understand when and why narrow finetuning leads to broad\nmisalignment. 
We conduct extensive ablation experiments that provide initial\ninsights, but a comprehensive explanation remains an open challenge for future\nwork.\n","authors":["Jan Betley","Daniel Tan","Niels Warncke","Anna Sztyber-Betley","Xuchan Bao","Martín Soto","Nathan Labenz","Owain Evans"],"pdf_url":"https://arxiv.org/pdf/2502.17424v4.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2503.03112v1","updated":"2025-03-05T02:12:23Z","published":"2025-03-05T02:12:23Z","title":"A Multimodal Framework for Topic Propagation Classification in Social\n Networks","summary":" The rapid proliferation of the Internet and the widespread adoption of social\nnetworks have significantly accelerated information dissemination. However,\nthis transformation has introduced complexities in information capture and\nprocessing, posing substantial challenges for researchers and practitioners.\nPredicting the dissemination of topic-related information within social\nnetworks has thus become a critical research focus. This paper proposes a\npredictive model for topic dissemination in social networks by integrating\nmultidimensional features derived from key dissemination characteristics.\nSpecifically, we introduce two novel indicators, user relationship breadth and\nuser authority, into the PageRank algorithm to quantify user influence more\neffectively. Additionally, we employ a Text-CNN model for sentiment\nclassification, extracting sentiment features from textual content. Temporal\nembeddings of nodes are encoded using a Bi-LSTM model to capture temporal\ndynamics. Furthermore, we refine the measurement of user interaction traces\nwith topics, replacing traditional topic view metrics with a more precise\ncommunication characteristics measure. Finally, we integrate the extracted\nmultidimensional features using a Transformer model, significantly enhancing\npredictive performance. 
Experimental results demonstrate that our proposed\nmodel outperforms traditional machine learning and unimodal deep learning\nmodels in terms of FI-Score, AUC, and Recall, validating its effectiveness in\npredicting topic propagation within social networks.\n","authors":["Yuchuan Jiang","Chaolong Jia","Yunyi Qin","Wei Cai","Yongsen Qian"],"pdf_url":"https://arxiv.org/pdf/2503.03112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18084v2","updated":"2025-03-05T02:08:32Z","published":"2024-12-24T01:48:07Z","title":"Property Enhanced Instruction Tuning for Multi-task Molecule Generation\n with Large Language Models","summary":" Large language models (LLMs) are widely applied in various natural language\nprocessing tasks such as question answering and machine translation. However,\ndue to the lack of labeled data and the difficulty of manual annotation for\nbiochemical properties, the performance for molecule generation tasks is still\nlimited, especially for tasks involving multi-properties constraints. In this\nwork, we present a two-step framework PEIT (Property Enhanced Instruction\nTuning) to improve LLMs for molecular-related tasks. In the first step, we use\ntextual descriptions, SMILES, and biochemical properties as multimodal inputs\nto pre-train a model called PEIT-GEN, by aligning multi-modal representations\nto synthesize instruction data. In the second step, we fine-tune existing\nopen-source LLMs with the synthesized data, the resulting PEIT-LLM can handle\nmolecule captioning, text-based molecule generation, molecular property\nprediction, and our newly proposed multi-constraint molecule generation tasks.\nExperimental results show that our pre-trained PEIT-GEN outperforms MolT5 and\nBioT5 in molecule captioning, demonstrating modalities align well between\ntextual descriptions, structures, and biochemical properties. 
Furthermore,\nPEIT-LLM shows promising improvements in multi-task molecule generation,\nproving the scalability of the PEIT framework for various molecular tasks. We\nrelease the code, constructed instruction data, and model checkpoints in\nhttps://github.com/chenlong164/PEIT.\n","authors":["Xuan Lin","Long Chen","Yile Wang","Xiangxiang Zeng","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2412.18084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03108v1","updated":"2025-03-05T02:08:12Z","published":"2025-03-05T02:08:12Z","title":"SoK: Knowledge is All You Need: Last Mile Delivery for Automated\n Provenance-based Intrusion Detection with LLMs","summary":" Recently, provenance-based intrusion detection systems (PIDSes) have been\nwidely proposed for endpoint threat analysis. However, due to the lack of\nsystematic integration and utilization of knowledge, existing PIDSes still\nrequire significant manual intervention for practical deployment, making full\nautomation challenging. This paper presents a disruptive innovation by\ncategorizing PIDSes according to the types of knowledge they utilize. In\nresponse to the prevalent issue of ``knowledge silos problem'' in existing\nresearch, we introduce a novel knowledge-driven provenance-based intrusion\ndetection framework, powered by large language models (LLMs). We also present\nOmniSec, a best practice system built upon this framework. By integrating\nattack representation knowledge, threat intelligence knowledge, and benign\nbehavior knowledge, OmniSec outperforms the state-of-the-art approaches on\npublic benchmark datasets. 
OmniSec is available online at\nhttps://anonymous.4open.science/r/PIDS-with-LLM-613B.\n","authors":["Wenrui Cheng","Tiantian Zhu","Chunlin Xiong","Haofei Sun","Zijun Wang","Shunan Jing","Mingqi Lv","Yan Chen"],"pdf_url":"https://arxiv.org/pdf/2503.03108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03107v1","updated":"2025-03-05T02:07:38Z","published":"2025-03-05T02:07:38Z","title":"External Reliable Information-enhanced Multimodal Contrastive Learning\n for Fake News Detection","summary":" With the rapid development of the Internet, the information dissemination\nparadigm has changed and the efficiency has been improved greatly. While this\nalso brings the quick spread of fake news and leads to negative impacts on\ncyberspace. Currently, the information presentation formats have evolved\ngradually, with the news formats shifting from texts to multimodal contents. As\na result, detecting multimodal fake news has become one of the research\nhotspots. However, multimodal fake news detection research field still faces\ntwo main challenges: the inability to fully and effectively utilize multimodal\ninformation for detection, and the low credibility or static nature of the\nintroduced external information, which limits dynamic updates. To bridge the\ngaps, we propose ERIC-FND, an external reliable information-enhanced multimodal\ncontrastive learning framework for fake news detection. ERIC-FND strengthens\nthe representation of news contents by entity-enriched external information\nenhancement method. It also enriches the multimodal news information via\nmultimodal semantic interaction method where the multimodal constrative\nlearning is employed to make different modality representations learn from each\nother. Moreover, an adaptive fusion method is taken to integrate the news\nrepresentations from different dimensions for the eventual classification.\nExperiments are done on two commonly used datasets in different languages, X\n(Twitter) and Weibo. 
Experiment results demonstrate that our proposed model\nERIC-FND outperforms existing state-of-the-art fake news detection methods\nunder the same settings.\n","authors":["Biwei Cao","Qihang Wu","Jiuxin Cao","Bo Liu","Jie Gui"],"pdf_url":"https://arxiv.org/pdf/2503.03107v1.pdf","comment":"accepted by AAAI'25"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2503.03750v1","updated":"2025-03-05T18:59:23Z","published":"2025-03-05T18:59:23Z","title":"The MASK Benchmark: Disentangling Honesty From Accuracy in AI Systems","summary":" As large language models (LLMs) become more capable and agentic, the\nrequirement for trust in their outputs grows significantly, yet at the same\ntime concerns have been mounting that models may learn to lie in pursuit of\ntheir goals. To address these concerns, a body of work has emerged around the\nnotion of \"honesty\" in LLMs, along with interventions aimed at mitigating\ndeceptive behaviors. However, evaluations of honesty are currently highly\nlimited, with no benchmark combining large scale and applicability to all\nmodels. Moreover, many benchmarks claiming to measure honesty in fact simply\nmeasure accuracy--the correctness of a model's beliefs--in disguise. In this\nwork, we introduce a large-scale human-collected dataset for measuring honesty\ndirectly, allowing us to disentangle accuracy from honesty for the first time.\nAcross a diverse set of LLMs, we find that while larger models obtain higher\naccuracy on our benchmark, they do not become more honest. Surprisingly, while\nmost frontier LLMs obtain high scores on truthfulness benchmarks, we find a\nsubstantial propensity in frontier LLMs to lie when pressured to do so,\nresulting in low honesty scores on our benchmark. We find that simple methods,\nsuch as representation engineering interventions, can improve honesty. 
These\nresults underscore the growing need for robust evaluations and effective\ninterventions to ensure LLMs remain trustworthy.\n","authors":["Richard Ren","Arunim Agarwal","Mantas Mazeika","Cristina Menghini","Robert Vacareanu","Brad Kenstler","Mick Yang","Isabelle Barrass","Alice Gatti","Xuwang Yin","Eduardo Trevino","Matias Geralnik","Adam Khoja","Dean Lee","Summer Yue","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2503.03750v1.pdf","comment":"Website: https://www.mask-benchmark.ai"},{"id":"http://arxiv.org/abs/2503.01048v3","updated":"2025-03-05T18:59:19Z","published":"2025-03-02T22:40:10Z","title":"Personalize Your LLM: Fake it then Align it","summary":" Personalizing large language models (LLMs) is essential for delivering\ntailored interactions that improve user experience. Many existing\npersonalization methods require fine-tuning LLMs for each user, rendering them\nprohibitively expensive for widespread adoption. Although retrieval-based\napproaches offer a more compute-efficient alternative, they still depend on\nlarge, high-quality datasets that are not consistently available for all users.\nTo address this challenge, we propose CHAMELEON, a scalable and efficient\npersonalization approach that uses (1) self-generated personal preference data\nand (2) representation editing to enable quick and cost-effective\npersonalization. 
Our experiments on various tasks, including those from the\nLaMP personalization benchmark, show that CHAMELEON efficiently adapts models\nto personal preferences, improving instruction-tuned models and outperforms two\npersonalization baselines by an average of 40% across two model architectures.\n","authors":["Yijing Zhang","Dyah Adila","Changho Shin","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2503.01048v3.pdf","comment":"NAACL 2025 Findings"},{"id":"http://arxiv.org/abs/2503.03747v1","updated":"2025-03-05T18:58:58Z","published":"2025-03-05T18:58:58Z","title":"PacketCLIP: Multi-Modal Embedding of Network Traffic and Language for\n Cybersecurity Reasoning","summary":" Traffic classification is vital for cybersecurity, yet encrypted traffic\nposes significant challenges. We present PacketCLIP, a multi-modal framework\ncombining packet data with natural language semantics through contrastive\npretraining and hierarchical Graph Neural Network (GNN) reasoning. PacketCLIP\nintegrates semantic reasoning with efficient classification, enabling robust\ndetection of anomalies in encrypted network flows. By aligning textual\ndescriptions with packet behaviors, it offers enhanced interpretability,\nscalability, and practical applicability across diverse security scenarios.\nPacketCLIP achieves a 95% mean AUC, outperforms baselines by 11.6%, and reduces\nmodel size by 92%, making it ideal for real-time anomaly detection. By bridging\nadvanced machine learning techniques and practical cybersecurity needs,\nPacketCLIP provides a foundation for scalable, efficient, and interpretable\nsolutions to tackle encrypted traffic classification and network intrusion\ndetection challenges in resource-constrained environments.\n","authors":["Ryozo Masukawa","Sanggeon Yun","Sungheon Jeong","Wenjun Huang","Yang Ni","Ian Bryant","Nathaniel D. 
Bastian","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2503.03747v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2503.03744v1","updated":"2025-03-05T18:56:48Z","published":"2025-03-05T18:56:48Z","title":"Constrained Gaussian Wasserstein Optimal Transport with Commutative\n Covariance Matrices","summary":" Optimal transport has found widespread applications in signal processing and\nmachine learning. Among its many equivalent formulations, optimal transport\nseeks to reconstruct a random variable/vector with a prescribed distribution at\nthe destination while minimizing the expected distortion relative to a given\nrandom variable/vector at the source. However, in practice, certain constraints\nmay render the optimal transport plan infeasible. In this work, we consider\nthree types of constraints: rate constraints, dimension constraints, and\nchannel constraints, motivated by perception-aware lossy compression,\ngenerative principal component analysis, and deep joint source-channel coding,\nrespectively. Special attenion is given to the setting termed Gaussian\nWasserstein optimal transport, where both the source and reconstruction\nvariables are multivariate Gaussian, and the end-to-end distortion is measured\nby the mean squared error. We derive explicit results for the minimum\nachievable mean squared error under the three aforementioned constraints when\nthe covariance matrices of the source and reconstruction variables commute.\n","authors":["Jun Chen","Jia Wang","Ruibin Li","Han Zhou","Wei Dong","Huan Liu","Yuanhao Yu"],"pdf_url":"https://arxiv.org/pdf/2503.03744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03736v1","updated":"2025-03-05T18:44:56Z","published":"2025-03-05T18:44:56Z","title":"Opportunistic Routing in Wireless Communications via Learnable\n State-Augmented Policies","summary":" This paper addresses the challenge of packet-based information routing in\nlarge-scale wireless communication networks. 
The problem is framed as a\nconstrained statistical learning task, where each network node operates using\nonly local information. Opportunistic routing exploits the broadcast nature of\nwireless communication to dynamically select optimal forwarding nodes, enabling\nthe information to reach the destination through multiple relay nodes\nsimultaneously. To solve this, we propose a State-Augmentation (SA) based\ndistributed optimization approach aimed at maximizing the total information\nhandled by the source nodes in the network. The problem formulation leverages\nGraph Neural Networks (GNNs), which perform graph convolutions based on the\ntopological connections between network nodes. Using an unsupervised learning\nparadigm, we extract routing policies from the GNN architecture, enabling\noptimal decisions for source nodes across various flows. Numerical experiments\ndemonstrate that the proposed method achieves superior performance when\ntraining a GNN-parameterized model, particularly when compared to baseline\nalgorithms. Additionally, applying the method to real-world network topologies\nand wireless ad-hoc network test beds validates its effectiveness, highlighting\nthe robustness and transferability of GNNs.\n","authors":["Sourajit Das","Navid NaderiAlizadeh","Rahul Mangharam","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2503.03736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03730v1","updated":"2025-03-05T18:40:19Z","published":"2025-03-05T18:40:19Z","title":"Towards Understanding Distilled Reasoning Models: A Representational\n Approach","summary":" In this paper, we investigate how model distillation impacts the development\nof reasoning features in large language models (LLMs). To explore this, we\ntrain a crosscoder on Qwen-series models and their fine-tuned variants. 
Our\nresults suggest that the crosscoder learns features corresponding to various\ntypes of reasoning, including self-reflection and computation verification.\nMoreover, we observe that distilled models contain unique reasoning feature\ndirections, which could be used to steer the model into over-thinking or\nincisive-thinking mode. In particular, we perform analysis on four specific\nreasoning categories: (a) self-reflection, (b) deductive reasoning, (c)\nalternative reasoning, and (d) contrastive reasoning. Finally, we examine the\nchanges in feature geometry resulting from the distillation process and find\nindications that larger distilled models may develop more structured\nrepresentations, which correlate with enhanced distillation performance. By\nproviding insights into how distillation modifies the model, our study\ncontributes to enhancing the transparency and reliability of AI systems.\n","authors":["David D. Baek","Max Tegmark"],"pdf_url":"https://arxiv.org/pdf/2503.03730v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2503.03729v1","updated":"2025-03-05T18:37:52Z","published":"2025-03-05T18:37:52Z","title":"Graph-Augmented LSTM for Forecasting Sparse Anomalies in\n Graph-Structured Time Series","summary":" Detecting anomalies in time series data is a critical task across many\ndomains. The challenge intensifies when anomalies are sparse and the data are\nmultivariate with relational dependencies across sensors or nodes. Traditional\nunivariate anomaly detectors struggle to capture such cross-node dependencies,\nparticularly in sparse anomaly settings. To address this, we propose a\ngraph-augmented time series forecasting approach that explicitly integrates the\ngraph of relationships among time series into an LSTM forecasting model. This\nenables the model to detect rare anomalies that might otherwise go unnoticed in\npurely univariate approaches. 
We evaluate the approach on two benchmark\ndatasets - the Yahoo Webscope S5 anomaly dataset and the METR-LA traffic sensor\nnetwork - and compare the performance of the Graph-Augmented LSTM against\nLSTM-only, ARIMA, and Prophet baselines. Results demonstrate that the\ngraph-augmented model achieves significantly higher precision and recall,\nimproving F1-score by up to 10% over the best baseline\n","authors":["Sneh Pillai"],"pdf_url":"https://arxiv.org/pdf/2503.03729v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2410.09156v3","updated":"2025-03-05T18:36:02Z","published":"2024-10-11T18:02:46Z","title":"On Discriminative Probabilistic Modeling for Self-Supervised\n Representation Learning","summary":" We study the discriminative probabilistic modeling on a continuous domain for\nthe data prediction task of (multimodal) self-supervised representation\nlearning. To address the challenge of computing the integral in the partition\nfunction for each anchor data, we leverage the multiple importance sampling\n(MIS) technique for robust Monte Carlo integration, which can recover\nInfoNCE-based contrastive loss as a special case. Within this probabilistic\nmodeling framework, we conduct generalization error analysis to reveal the\nlimitation of current InfoNCE-based contrastive loss for self-supervised\nrepresentation learning and derive insights for developing better approaches by\nreducing the error of Monte Carlo integration. To this end, we propose a novel\nnon-parametric method for approximating the sum of conditional probability\ndensities required by MIS through convex optimization, yielding a new\ncontrastive objective for self-supervised representation learning. Moreover, we\ndesign an efficient algorithm for solving the proposed objective. We\nempirically compare our algorithm to representative baselines on the\ncontrastive image-language pretraining task. 
Experimental results on the CC3M\nand CC12M datasets demonstrate the superior overall performance of our\nalgorithm. Our code is available at https://github.com/bokun-wang/NUCLR.\n","authors":["Bokun Wang","Yunwen Lei","Yiming Ying","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2410.09156v3.pdf","comment":"To appear in ICLR 2025"},{"id":"http://arxiv.org/abs/2503.03724v1","updated":"2025-03-05T18:24:58Z","published":"2025-03-05T18:24:58Z","title":"Deep Causal Behavioral Policy Learning: Applications to Healthcare","summary":" We present a deep learning-based approach to studying dynamic clinical\nbehavioral regimes in diverse non-randomized healthcare settings. Our proposed\nmethodology - deep causal behavioral policy learning (DC-BPL) - uses deep\nlearning algorithms to learn the distribution of high-dimensional clinical\naction paths, and identifies the causal link between these action paths and\npatient outcomes. Specifically, our approach: (1) identifies the causal effects\nof provider assignment on clinical outcomes; (2) learns the distribution of\nclinical actions a given provider would take given evolving patient\ninformation; (3) and combines these steps to identify the optimal provider for\na given patient type and emulate that provider's care decisions. Underlying\nthis strategy, we train a large clinical behavioral model (LCBM) on electronic\nhealth records data using a transformer architecture, and demonstrate its\nability to estimate clinical behavioral policies. We propose a novel\ninterpretation of a behavioral policy learned using the LCBM: that it is an\nefficient encoding of complex, often implicit, knowledge used to treat a\npatient. This allows us to learn a space of policies that are critical to a\nwide range of healthcare applications, in which the vast majority of clinical\nknowledge is acquired tacitly through years of practice and only a tiny\nfraction of information relevant to patient care is written down (e.g. 
in\ntextbooks, studies or standardized guidelines).\n","authors":["Jonas Knecht","Anna Zink","Jonathan Kolstad","Maya Petersen"],"pdf_url":"https://arxiv.org/pdf/2503.03724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14395v2","updated":"2025-03-05T18:17:28Z","published":"2024-04-22T17:55:56Z","title":"PARAMANU-GANITA: Can Small Math Language Models Rival with Large\n Language Models on Mathematical Reasoning?","summary":" In this paper, we study whether domain specific pretraining of small\ngenerative language models (SLM) from scratch with domain specialized tokenizer\nand Chain-of-Thought (CoT) instruction fine-tuning results in competitive\nperformance on mathematical reasoning compared to LLMs? Secondly, whether this\napproach is environmentally sustainable, highly cost efficient? To address\nthese research questions, we present Paramanu-Ganita, a 208 million-parameter\nnovel decoder-only Auto Regressive SLM on mathematics. We performed pretraining\nfrom scratch on 31.5 billion tokens for 170 A100 hours using a context size of\n4096 on a mixed mathematical corpus consisting of web pages, source code,\ntextbooks, CoT templatised StackOverflow QA pairs, and mathematical lecture\nnotes in LaTeX curated by us. We also trained a math and code specialised BPE\ntokenizer. We proposed and performed CoT instruction fine-tuning of\nParamanu-Ganita on the MetaMathQA dataset. Our model Paramanu-Ganita, despite\nbeing 34 times smaller than the 7B LLMs, outperforms generalist LLMs by\napproximately 30% points, and even math-specialised LLMs by 3-23% points in\nGSM8K test accuracy metric. On MATH benchmark, Paramanu-Ganita outperformed the\nvarious models by 6-8% points. On benchmarks like LogiQA, MMLU (high school,\ncollege level), and competitive exams level, AGIEVAL (AQuA-RAT, SAT-Math),\nParamanu-Ganita outperformed others by 1-4%. 
Our model is available at\nhttps://huggingface.co/gyanai/paramanu-ganita-208M-hf .\n","authors":["Mitodru Niyogi","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2404.14395v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00675v2","updated":"2025-03-05T18:14:25Z","published":"2024-03-01T17:08:30Z","title":"Reusing Historical Trajectories in Natural Policy Gradient via\n Importance Sampling: Convergence and Convergence Rate","summary":" Reinforcement learning provides a mathematical framework for learning-based\ncontrol, whose success largely depends on the amount of data it can utilize.\nThe efficient utilization of historical trajectories obtained from previous\npolicies is essential for expediting policy optimization. Empirical evidence\nhas shown that policy gradient methods based on importance sampling work well.\nHowever, existing literature often neglect the interdependence between\ntrajectories from different iterations, and the good empirical performance\nlacks a rigorous theoretical justification. In this paper, we study a variant\nof the natural policy gradient method with reusing historical trajectories via\nimportance sampling. We show that the bias of the proposed estimator of the\ngradient is asymptotically negligible, the resultant algorithm is convergent,\nand reusing past trajectories helps improve the convergence rate. We further\napply the proposed estimator to popular policy optimization algorithms such as\ntrust region policy optimization. 
Our theoretical results are verified on\nclassical benchmarks.\n","authors":["Yifan Lin","Yuhao Wang","Enlu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.00675v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03888v3","updated":"2025-03-05T18:04:40Z","published":"2025-01-07T15:51:49Z","title":"Neural DNF-MT: A Neuro-symbolic Approach for Learning Interpretable and\n Editable Policies","summary":" Although deep reinforcement learning has been shown to be effective, the\nmodel's black-box nature presents barriers to direct policy interpretation. To\naddress this problem, we propose a neuro-symbolic approach called neural DNF-MT\nfor end-to-end policy learning. The differentiable nature of the neural DNF-MT\nmodel enables the use of deep actor-critic algorithms for training. At the same\ntime, its architecture is designed so that trained models can be directly\ntranslated into interpretable policies expressed as standard (bivalent or\nprobabilistic) logic programs. Moreover, additional layers can be included to\nextract abstract features from complex observations, acting as a form of\npredicate invention. The logic representations are highly interpretable, and we\nshow how the bivalent representations of deterministic policies can be edited\nand incorporated back into a neural model, facilitating manual intervention and\nadaptation of learned policies. We evaluate our approach on a range of tasks\nrequiring learning deterministic or stochastic behaviours from various forms of\nobservations. 
Our empirical results show that our neural DNF-MT model performs\nat the level of competing black-box methods whilst providing interpretable\npolicies.\n","authors":["Kexin Gu Baugh","Luke Dickens","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2501.03888v3.pdf","comment":"AAMAS 2025 (with Appendix)"},{"id":"http://arxiv.org/abs/2503.03715v1","updated":"2025-03-05T18:04:30Z","published":"2025-03-05T18:04:30Z","title":"Handling Uncertainty in Health Data using Generative Algorithms","summary":" Understanding and managing uncertainty is crucial in machine learning,\nespecially in high-stakes domains like healthcare, where class imbalance can\nimpact predictions. This paper introduces RIGA, a novel pipeline that mitigates\nclass imbalance using generative AI. By converting tabular healthcare data into\nimages, RIGA leverages models like cGAN, VQVAE, and VQGAN to generate balanced\nsamples, improving classification performance. These representations are\nprocessed by CNNs and later transformed back into tabular format for seamless\nintegration. This approach enhances traditional classifiers like XGBoost,\nimproves Bayesian structure learning, and strengthens ML model robustness by\ngenerating realistic synthetic data for underrepresented classes.\n","authors":["Mahdi Arab Loodaricheh","Neh Majmudar","Anita Raja","Ansaf Salleb-Aouissi"],"pdf_url":"https://arxiv.org/pdf/2503.03715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03710v1","updated":"2025-03-05T18:01:05Z","published":"2025-03-05T18:01:05Z","title":"Improving LLM Safety Alignment with Dual-Objective Optimization","summary":" Existing training-time safety alignment techniques for large language models\n(LLMs) remain vulnerable to jailbreak attacks. Direct preference optimization\n(DPO), a widely deployed alignment method, exhibits limitations in both\nexperimental and theoretical contexts as its loss function proves suboptimal\nfor refusal learning. 
Through gradient-based analysis, we identify these\nshortcomings and propose an improved safety alignment that disentangles DPO\nobjectives into two components: (1) robust refusal training, which encourages\nrefusal even when partial unsafe generations are produced, and (2) targeted\nunlearning of harmful knowledge. This approach significantly increases LLM\nrobustness against a wide range of jailbreak attacks, including prefilling,\nsuffix, and multi-turn attacks across both in-distribution and\nout-of-distribution scenarios. Furthermore, we introduce a method to emphasize\ncritical refusal tokens by incorporating a reward-based token-level weighting\nmechanism for refusal learning, which further improves the robustness against\nadversarial exploits. Our research also suggests that robustness to jailbreak\nattacks is correlated with token distribution shifts in the training process\nand internal representations of refusal and harmful tokens, offering valuable\ndirections for future research in LLM safety alignment. The code is available\nat https://github.com/wicai24/DOOR-Alignment\n","authors":["Xuandong Zhao","Will Cai","Tianneng Shi","David Huang","Licong Lin","Song Mei","Dawn Song"],"pdf_url":"https://arxiv.org/pdf/2503.03710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03707v1","updated":"2025-03-05T17:58:16Z","published":"2025-03-05T17:58:16Z","title":"Curating Demonstrations using Online Experience","summary":" Many robot demonstration datasets contain heterogeneous demonstrations of\nvarying quality. This heterogeneity may benefit policy pre-training, but can\nhinder robot performance when used with a final imitation learning objective.\nIn particular, some strategies in the data may be less reliable than others or\nmay be underrepresented in the data, leading to poor performance when such\nstrategies are sampled at test time. 
Moreover, such unreliable or\nunderrepresented strategies can be difficult even for people to discern, and\nsifting through demonstration datasets is time-consuming and costly. On the\nother hand, policy performance when trained on such demonstrations can reflect\nthe reliability of different strategies. We thus propose for robots to\nself-curate based on online robot experience (Demo-SCORE). More specifically,\nwe train and cross-validate a classifier to discern successful policy roll-outs\nfrom unsuccessful ones and use the classifier to filter heterogeneous\ndemonstration datasets. Our experiments in simulation and the real world show\nthat Demo-SCORE can effectively identify suboptimal demonstrations without\nmanual curation. Notably, Demo-SCORE achieves over 15-35% higher absolute\nsuccess rate in the resulting policy compared to the base policy trained with\nall original demonstrations.\n","authors":["Annie S. Chen","Alec M. Lessing","Yuejiang Liu","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2503.03707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03705v1","updated":"2025-03-05T17:56:20Z","published":"2025-03-05T17:56:20Z","title":"Effective LLM Knowledge Learning via Model Generalization","summary":" Large language models (LLMs) are trained on enormous documents that contain\nextensive world knowledge. However, it is still not well-understood how\nknowledge is acquired via autoregressive pre-training. This lack of\nunderstanding greatly hinders effective knowledge learning, especially for\ncontinued pretraining on up-to-date information, as this evolving information\noften lacks diverse repetitions like foundational knowledge. In this paper, we\nfocus on understanding and improving LLM knowledge learning. We found and\nverified that knowledge learning for LLMs can be deemed as an implicit\nsupervised task hidden in the autoregressive pre-training objective. 
Our\nfindings suggest that knowledge learning for LLMs would benefit from methods\ndesigned to improve generalization ability for supervised tasks. Based on our\nanalysis, we propose the formatting-based data augmentation to grow\nin-distribution samples, which does not present the risk of altering the facts\nembedded in documents as text paraphrasing. We also introduce sharpness-aware\nminimization as an effective optimization algorithm to better improve\ngeneralization. Moreover, our analysis and method can be readily extended to\ninstruction tuning. Extensive experiment results validate our findings and\ndemonstrate our methods' effectiveness in both continued pre-training and\ninstruction tuning. This paper offers new perspectives and insights to\ninterpret and design effective strategies for LLM knowledge learning.\n","authors":["Mingkang Zhu","Xi Chen","Zhongdao Wang","Bei Yu","Hengshuang Zhao","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2503.03705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03704v1","updated":"2025-03-05T17:53:24Z","published":"2025-03-05T17:53:24Z","title":"A Practical Memory Injection Attack against LLM Agents","summary":" Agents based on large language models (LLMs) have demonstrated strong\ncapabilities in a wide range of complex, real-world applications. However, LLM\nagents with a compromised memory bank may easily produce harmful outputs when\nthe past records retrieved for demonstration are malicious. In this paper, we\npropose a novel Memory INJection Attack, MINJA, that enables the injection of\nmalicious records into the memory bank by only interacting with the agent via\nqueries and output observations. These malicious records are designed to elicit\na sequence of malicious reasoning steps leading to undesirable agent actions\nwhen executing the victim user's query. 
Specifically, we introduce a sequence\nof bridging steps to link the victim query to the malicious reasoning steps.\nDuring the injection of the malicious record, we propose an indication prompt\nto guide the agent to autonomously generate our designed bridging steps. We\nalso propose a progressive shortening strategy that gradually removes the\nindication prompt, such that the malicious record will be easily retrieved when\nprocessing the victim query comes after. Our extensive experiments across\ndiverse agents demonstrate the effectiveness of MINJA in compromising agent\nmemory. With minimal requirements for execution, MINJA enables any user to\ninfluence agent memory, highlighting practical risks of LLM agents.\n","authors":["Shen Dong","Shaocheng Xu","Pengfei He","Yige Li","Jiliang Tang","Tianming Liu","Hui Liu","Zhen Xiang"],"pdf_url":"https://arxiv.org/pdf/2503.03704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01776v2","updated":"2025-03-05T17:51:09Z","published":"2025-03-03T17:59:48Z","title":"Beyond Matryoshka: Revisiting Sparse Coding for Adaptive Representation","summary":" Many large-scale systems rely on high-quality deep representations\n(embeddings) to facilitate tasks like retrieval, search, and generative\nmodeling. Matryoshka Representation Learning (MRL) recently emerged as a\nsolution for adaptive embedding lengths, but it requires full model retraining\nand suffers from noticeable performance degradations at short lengths. In this\npaper, we show that sparse coding offers a compelling alternative for achieving\nadaptive representation with minimal overhead and higher fidelity. We propose\nContrastive Sparse Representation (CSR), a method that sparsifies pre-trained\nembeddings into a high-dimensional but selectively activated feature space. By\nleveraging lightweight autoencoding and task-aware contrastive objectives, CSR\npreserves semantic quality while allowing flexible, cost-effective inference at\ndifferent sparsity levels. 
Extensive experiments on image, text, and multimodal\nbenchmarks demonstrate that CSR consistently outperforms MRL in terms of both\naccuracy and retrieval speed-often by large margins-while also cutting training\ntime to a fraction of that required by MRL. Our results establish sparse coding\nas a powerful paradigm for adaptive representation learning in real-world\napplications where efficiency and fidelity are both paramount. Code is\navailable at https://github.com/neilwen987/CSR_Adaptive_Rep\n","authors":["Tiansheng Wen","Yifei Wang","Zequn Zeng","Zhong Peng","Yudi Su","Xinyang Liu","Bo Chen","Hongwei Liu","Stefanie Jegelka","Chenyu You"],"pdf_url":"https://arxiv.org/pdf/2503.01776v2.pdf","comment":"A novel sparse coding framework designed for learning adaptive\n representation"},{"id":"http://arxiv.org/abs/2503.03684v1","updated":"2025-03-05T17:25:20Z","published":"2025-03-05T17:25:20Z","title":"Towards Trustworthy Federated Learning","summary":" This paper develops a comprehensive framework to address three critical\ntrustworthy challenges in federated learning (FL): robustness against Byzantine\nattacks, fairness, and privacy preservation. To improve the system's defense\nagainst Byzantine attacks that send malicious information to bias the system's\nperformance, we develop a Two-sided Norm Based Screening (TNBS) mechanism,\nwhich allows the central server to crop the gradients that have the l lowest\nnorms and h highest norms. TNBS functions as a screening tool to filter out\npotential malicious participants whose gradients are far from the honest ones.\nTo promote egalitarian fairness, we adopt the q-fair federated learning\n(q-FFL). Furthermore, we adopt a differential privacy-based scheme to prevent\nraw data at local clients from being inferred by curious parties. 
Convergence\nguarantees are provided for the proposed framework under different scenarios.\nExperimental results on real datasets demonstrate that the proposed framework\neffectively improves robustness and fairness while managing the trade-off\nbetween privacy and accuracy. This work appears to be the first study that\nexperimentally and theoretically addresses fairness, privacy, and robustness in\ntrustworthy FL.\n","authors":["Alina Basharat","Yijun Bian","Ping Xu","Zhi Tian"],"pdf_url":"https://arxiv.org/pdf/2503.03684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.01777v2","updated":"2025-03-05T17:25:07Z","published":"2025-02-03T19:29:42Z","title":"CTC-DRO: Robust Optimization for Reducing Language Disparities in Speech\n Recognition","summary":" Modern deep learning models often achieve high overall performance, but\nconsistently fail on specific subgroups. Group distributionally robust\noptimization (group DRO) addresses this problem by minimizing the worst-group\nloss, but it fails when group losses misrepresent performance differences\nbetween groups. This is common in domains like speech, where the widely used\nconnectionist temporal classification (CTC) loss scales with input length and\nvaries with linguistic and acoustic properties, leading to spurious differences\nbetween group losses. We present CTC-DRO, which addresses the shortcomings of\nthe group DRO objective by smoothing the group weight update to prevent\noveremphasis on consistently high-loss groups, while using input length-matched\nbatching to mitigate CTC's scaling issues. We evaluate CTC-DRO on the task of\nmultilingual automatic speech recognition (ASR) across five language sets from\nthe ML-SUPERB 2.0 benchmark. CTC-DRO consistently outperforms group DRO and\nCTC-based baseline models, reducing the worst-language error by up to 47.1% and\nthe average error by up to 32.9%. 
CTC-DRO can be applied to ASR with minimal\ncomputational costs, and offers the potential for reducing group disparities in\nother domains with similar challenges.\n","authors":["Martijn Bartelds","Ananjan Nandi","Moussa Koulako Bala Doumbouya","Dan Jurafsky","Tatsunori Hashimoto","Karen Livescu"],"pdf_url":"https://arxiv.org/pdf/2502.01777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03676v1","updated":"2025-03-05T17:11:02Z","published":"2025-03-05T17:11:02Z","title":"Optimally Installing Strict Equilibria","summary":" In this work, we develop a reward design framework for installing a desired\nbehavior as a strict equilibrium across standard solution concepts: dominant\nstrategy equilibrium, Nash equilibrium, correlated equilibrium, and coarse\ncorrelated equilibrium. We also extend our framework to capture the\nMarkov-perfect equivalents of each solution concept. Central to our framework\nis a comprehensive mathematical characterization of strictly installable, based\non the desired solution concept and the behavior's structure. These\ncharacterizations lead to efficient iterative algorithms, which we generalize\nto handle optimization objectives through linear programming. Finally, we\nexplore how our results generalize to bounded rational agents.\n","authors":["Jeremy McMahan","Young Wu","Yudong Chen","Xiaojin Zhu","Qiaomin Xie"],"pdf_url":"https://arxiv.org/pdf/2503.03676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17579v3","updated":"2025-03-05T17:09:46Z","published":"2024-10-23T06:08:45Z","title":"Bonsai: Gradient-free Graph Distillation for Node Classification","summary":" Graph distillation has emerged as a promising avenue to enable scalable\ntraining of GNNs by compressing the training dataset while preserving essential\ngraph characteristics. Our study uncovers significant shortcomings in current\ngraph distillation techniques. 
First, the majority of the algorithms\nparadoxically require training on the full dataset to perform distillation.\nSecond, due to their gradient-emulating approach, these methods require fresh\ndistillation for any change in hyperparameters or GNN architecture, limiting\ntheir flexibility and reusability. Finally, they fail to achieve substantial\nsize reduction due to synthesizing fully-connected, edge-weighted graphs. To\naddress these challenges, we present Bonsai, a novel graph distillation method\nempowered by the observation that \\textit{computation trees} form the\nfundamental processing units of message-passing GNNs. Bonsai distills datasets\nby encoding a careful selection of \\textit{exemplar} trees that maximize the\nrepresentation of all computation trees in the training set. This unique\napproach imparts Bonsai as the first linear-time, model-agnostic graph\ndistillation algorithm for node classification that outperforms existing\nbaselines across $6$ real-world datasets on accuracy, while being $22$ times\nfaster on average. Bonsai is grounded in rigorous mathematical guarantees on\nthe adopted approximation strategies making it robust to GNN architectures,\ndatasets, and parameters.\n","authors":["Mridul Gupta","Samyak Jain","Vansh Ramani","Hariprasad Kodamana","Sayan Ranu"],"pdf_url":"https://arxiv.org/pdf/2410.17579v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14131v3","updated":"2025-03-05T17:05:55Z","published":"2024-05-23T03:11:07Z","title":"Statistical Advantages of Perturbing Cosine Router in Mixture of Experts","summary":" The cosine router in Mixture of Experts (MoE) has recently emerged as an\nattractive alternative to the conventional linear router. 
Indeed, the cosine\nrouter demonstrates favorable performance in image and language tasks and\nexhibits better ability to mitigate the representation collapse issue, which\noften leads to parameter redundancy and limited representation potentials.\nDespite its empirical success, a comprehensive analysis of the cosine router in\nMoE has been lacking. Considering the least square estimation of the cosine\nrouting MoE, we demonstrate that due to the intrinsic interaction of the model\nparameters in the cosine router via some partial differential equations,\nregardless of the structures of the experts, the estimation rates of experts\nand model parameters can be as slow as $\\mathcal{O}(1/\\log^{\\tau}(n))$ where\n$\\tau > 0$ is some constant and $n$ is the sample size. Surprisingly, these\npessimistic non-polynomial convergence rates can be circumvented by the widely\nused technique in practice to stabilize the cosine router -- simply adding\nnoises to the $\\ell^2$-norms in the cosine router, which we refer to as\n\\textit{perturbed cosine router}. Under the strongly identifiable settings of\nthe expert functions, we prove that the estimation rates for both the experts\nand model parameters under the perturbed cosine routing MoE are significantly\nimproved to polynomial rates. 
Finally, we conduct extensive simulation studies\nin both synthetic and real data settings to empirically validate our\ntheoretical results.\n","authors":["Huy Nguyen","Pedram Akbarian","Trang Pham","Trang Nguyen","Shujian Zhang","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2405.14131v3.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2503.03666v1","updated":"2025-03-05T16:59:08Z","published":"2025-03-05T16:59:08Z","title":"Analogical Reasoning Inside Large Language Models: Concept Vectors and\n the Limits of Abstraction","summary":" Analogical reasoning relies on conceptual abstractions, but it is unclear\nwhether Large Language Models (LLMs) harbor such internal representations. We\nexplore distilled representations from LLM activations and find that function\nvectors (FVs; Todd et al., 2024) - compact representations for in-context\nlearning (ICL) tasks - are not invariant to simple input changes (e.g.,\nopen-ended vs. multiple-choice), suggesting they capture more than pure\nconcepts. Using representational similarity analysis (RSA), we localize a small\nset of attention heads that encode invariant concept vectors (CVs) for verbal\nconcepts like \"antonym\". These CVs function as feature detectors that operate\nindependently of the final output - meaning that a model may form a correct\ninternal representation yet still produce an incorrect output. Furthermore, CVs\ncan be used to causally guide model behaviour. However, for more abstract\nconcepts like \"previous\" and \"next\", we do not observe invariant linear\nrepresentations, a finding we link to generalizability issues LLMs display\nwithin these domains.\n","authors":["Gustaw Opiełka","Hannes Rosenbusch","Claire E. 
Stevenson"],"pdf_url":"https://arxiv.org/pdf/2503.03666v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.07402v2","updated":"2025-03-05T16:48:23Z","published":"2024-09-11T16:42:22Z","title":"What to align in multimodal contrastive learning?","summary":" Humans perceive the world through multisensory integration, blending the\ninformation of different modalities to adapt their behavior. Contrastive\nlearning offers an appealing solution for multimodal self-supervised learning.\nIndeed, by considering each modality as a different view of the same entity, it\nlearns to align features of different modalities in a shared representation\nspace. However, this approach is intrinsically limited as it only learns shared\nor redundant information between modalities, while multimodal interactions can\narise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal\nlearning strategy that enables the communication between modalities in a single\nmultimodal space. Instead of imposing cross- or intra- modality constraints, we\npropose to align multimodal representations by maximizing the mutual\ninformation between augmented versions of these multimodal features. Our\ntheoretical analysis shows that shared, synergistic and unique terms of\ninformation naturally emerge from this formulation, allowing us to estimate\nmultimodal interactions beyond redundancy. We test CoMM both in a controlled\nand in a series of real-world settings: in the former, we demonstrate that CoMM\neffectively captures redundant, unique and synergistic information between\nmodalities. In the latter, CoMM learns complex multimodal interactions and\nachieves state-of-the-art results on the seven multimodal benchmarks. 
Code is\navailable at https://github.com/Duplums/CoMM\n","authors":["Benoit Dufumier","Javiera Castillo-Navarro","Devis Tuia","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2409.07402v2.pdf","comment":"ICLR 2025, 25 pages"},{"id":"http://arxiv.org/abs/2503.03660v1","updated":"2025-03-05T16:47:36Z","published":"2025-03-05T16:47:36Z","title":"Chunking the Critic: A Transformer-based Soft Actor-Critic with N-Step\n Returns","summary":" Soft Actor-Critic (SAC) critically depends on its critic network, which\ntypically evaluates a single state-action pair to guide policy updates. Using\nN-step returns is a common practice to reduce the bias in the target values of\nthe critic. However, using N-step returns can again introduce high variance and\nnecessitates importance sampling, often destabilizing training. Recent\nalgorithms have also explored action chunking-such as direct action repetition\nand movement primitives-to enhance exploration. In this paper, we propose a\nTransformer-based Critic Network for SAC that integrates the N-returns\nframework in a stable and efficient manner. Unlike approaches that perform\nchunking in the actor network, we feed chunked actions into the critic network\nto explore potential performance gains. Our architecture leverages the\nTransformer's ability to process sequential information, facilitating more\nrobust value estimation. Empirical results show that this method not only\nachieves efficient, stable training but also excels in sparse\nreward/multi-phase environments-traditionally a challenge for step-based\nmethods. 
These findings underscore the promise of combining Transformer-based\ncritics with N-returns to advance reinforcement learning performance\n","authors":["Dong Tian","Ge Li","Hongyi Zhou","Onur Celik","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2503.03660v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2503.03659v1","updated":"2025-03-05T16:47:08Z","published":"2025-03-05T16:47:08Z","title":"Finite-sample valid prediction of future insurance claims in the\n regression problem","summary":" In the current insurance literature, prediction of insurance claims in the\nregression problem is often performed with a statistical model. This\nmodel-based approach may suffer from several drawbacks: (i) model\nmisspecification, (ii) selection effect, and (iii) lack of finite-sample\nvalidity. This article addresses these three issues simultaneously by employing\nconformal prediction-a general machine learning strategy for valid predictions.\nThe proposed method is both model-free and tuning-parameter-free. It also\nguarantees finite-sample validity at a pre-assigned coverage probability level.\n","authors":["Liang Hong"],"pdf_url":"https://arxiv.org/pdf/2503.03659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03656v1","updated":"2025-03-05T16:39:04Z","published":"2025-03-05T16:39:04Z","title":"Robust Learning of Diverse Code Edits","summary":" Software engineering activities frequently involve edits to existing code.\nHowever, contemporary code language models (LMs) lack the ability to handle\ndiverse types of code-edit requirements. In this work, we attempt to overcome\nthis shortcoming through (1) a novel synthetic data generation pipeline and (2)\na robust model adaptation algorithm. Starting with seed code examples and\ndiverse editing criteria, our pipeline generates high-quality samples\ncomprising original and modified code, along with natural language instructions\nin different styles and verbosity. 
Today's code LMs come bundled with strong\nabilities, such as code generation and instruction following, which should not\nbe lost due to fine-tuning. To ensure this, we propose a novel adaptation\nalgorithm, SeleKT, that (a) leverages a dense gradient-based step to identify\nthe weights that are most important for code editing, and (b) does a sparse\nprojection onto the base model to avoid overfitting. Using our approach, we\nobtain a new series of models NextCoder (adapted from QwenCoder-2.5) that\nachieves strong results on five code-editing benchmarks, outperforming\ncomparable size models and even several larger ones. We show the generality of\nour approach on two model families (DeepSeekCoder and QwenCoder), compare\nagainst other fine-tuning approaches, and demonstrate robustness by showing\nretention of code generation abilities post adaptation.\n","authors":["Tushar Aggarwal","Swayam Singh","Abhijeet Awasthi","Aditya Kanade","Nagarajan Natarajan"],"pdf_url":"https://arxiv.org/pdf/2503.03656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00816v2","updated":"2025-03-05T16:36:05Z","published":"2024-10-28T08:10:21Z","title":"CycleResearcher: Improving Automated Research via Automated Review","summary":" The automation of scientific discovery has been a long-standing goal within\nthe research community, driven by the potential to accelerate knowledge\ncreation. While significant progress has been made using commercial large\nlanguage models (LLMs) as research assistants or idea generators, the\npossibility of automating the entire research process with open-source LLMs\nremains largely unexplored. This paper explores the feasibility of using\nopen-source post-trained LLMs as autonomous agents capable of performing the\nfull cycle of automated research and review, from literature review and\nmanuscript preparation to peer review and paper refinement. 
Our iterative\npreference training framework consists of CycleResearcher, which conducts\nresearch tasks, and CycleReviewer, which simulates the peer review process,\nproviding iterative feedback via reinforcement learning. To train these models,\nwe develop two new datasets, Review-5k and Research-14k, reflecting real-world\nmachine learning research and peer review dynamics. Our results demonstrate\nthat CycleReviewer achieves promising performance with a 26.89\\% reduction in\nmean absolute error (MAE) compared to individual human reviewers in predicting\npaper scores, indicating the potential of LLMs to effectively assist\nexpert-level research evaluation. In research, the papers generated by the\nCycleResearcher model achieved a score of 5.36 in simulated peer reviews,\nshowing some competitiveness in terms of simulated review scores compared to\nthe preprint level of 5.24 from human experts, while still having room for\nimprovement compared to the accepted paper level of 5.69. This work represents\na significant step toward fully automated scientific inquiry, providing ethical\nsafeguards and exploring AI-driven research capabilities. 
The code, dataset and\nmodel weight are released at https://wengsyx.github.io/Researcher/\n","authors":["Yixuan Weng","Minjun Zhu","Guangsheng Bao","Hongbo Zhang","Jindong Wang","Yue Zhang","Linyi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.00816v2.pdf","comment":"Accept in ICLR 2025"},{"id":"http://arxiv.org/abs/2503.03654v1","updated":"2025-03-05T16:32:47Z","published":"2025-03-05T16:32:47Z","title":"Improving Neutral Point of View Text Generation through\n Parameter-Efficient Reinforcement Learning and a Small-Scale High-Quality\n Dataset","summary":" This paper describes the construction of a dataset and the evaluation of\ntraining methods to improve generative large language models' (LLMs) ability to\nanswer queries on sensitive topics with a Neutral Point of View (NPOV), i.e.,\nto provide significantly more informative, diverse and impartial answers. The\ndataset, the SHQ-NPOV dataset, comprises 300 high-quality, human-written\nquadruplets: a query on a sensitive topic, an answer, an NPOV rating, and a set\nof links to source texts elaborating the various points of view. The first key\ncontribution of this paper is a new methodology to create such datasets through\niterative rounds of human peer-critique and annotator training, which we\nrelease alongside the dataset. The second key contribution is the\nidentification of a highly effective training regime for parameter-efficient\nreinforcement learning (PE-RL) to improve NPOV generation. We compare and\nextensively evaluate PE-RL and multiple baselines-including LoRA finetuning (a\nstrong baseline), SFT and RLHF.\n PE-RL not only improves on overall NPOV quality compared to the strongest\nbaseline ($97.06\\%\\rightarrow 99.08\\%$), but also scores much higher on\nfeatures linguists identify as key to separating good answers from the best\nanswers ($60.25\\%\\rightarrow 85.21\\%$ for presence of supportive details,\n$68.74\\%\\rightarrow 91.43\\%$ for absence of oversimplification). 
A qualitative\nanalysis corroborates this. Finally, our evaluation finds no statistical\ndifferences between results on topics that appear in the training dataset and\nthose on separated evaluation topics, which provides strong evidence that our\napproach to training PE-RL exhibits very effective out of topic generalization.\n","authors":["Jessica Hoffmann","Christiane Ahlheim","Zac Yu","Aria Walfrand","Jarvis Jin","Marie Tano","Ahmad Beirami","Erin van Liemt","Nithum Thain","Hakim Sidahmed","Lucas Dixon"],"pdf_url":"https://arxiv.org/pdf/2503.03654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03649v1","updated":"2025-03-05T16:25:58Z","published":"2025-03-05T16:25:58Z","title":"Limits of nonlinear and dispersive fiber propagation for photonic\n extreme learning","summary":" We report a generalized nonlinear Schr\\\"odinger equation simulation model of\nan extreme learning machine based on optical fiber propagation. Using\nhandwritten digit classification as a benchmark, we study how accuracy depends\non propagation dynamics, as well as parameters governing spectral encoding,\nreadout, and noise. Test accuracies of over 91% and 93% are found for\npropagation in the anomalous and normal dispersion regimes respectively. Our\nsimulation results also suggest that quantum noise on the input pulses\nintroduces an intrinsic penalty to ELM performance.\n","authors":["Andrei V. Ermolaev","Mathilde Hary","Lev Leybov","Piotr Ryczkowski","Anas Skalli","Daniel Brunner","Goëry Genty","John M. Dudley"],"pdf_url":"https://arxiv.org/pdf/2503.03649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03634v1","updated":"2025-03-05T16:14:43Z","published":"2025-03-05T16:14:43Z","title":"Feature Matching Intervention: Leveraging Observational Data for Causal\n Representation Learning","summary":" A major challenge in causal discovery from observational data is the absence\nof perfect interventions, making it difficult to distinguish causal features\nfrom spurious ones. 
We propose an innovative approach, Feature Matching\nIntervention (FMI), which uses a matching procedure to mimic perfect\ninterventions. We define causal latent graphs, extending structural causal\nmodels to latent feature space, providing a framework that connects FMI with\ncausal graph learning. Our feature matching procedure emulates perfect\ninterventions within these causal latent graphs. Theoretical results\ndemonstrate that FMI exhibits strong out-of-distribution (OOD)\ngeneralizability. Experiments further highlight FMI's superior performance in\neffectively identifying causal features solely from observational data.\n","authors":["Haoze Li","Jun Xie"],"pdf_url":"https://arxiv.org/pdf/2503.03634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.09647v2","updated":"2025-03-05T16:14:16Z","published":"2025-02-11T00:04:32Z","title":"Unveiling Simplicities of Attention: Adaptive Long-Context Head\n Identification","summary":" The ability to process long contexts is crucial for many natural language\nprocessing tasks, yet it remains a significant challenge. While substantial\nprogress has been made in enhancing the efficiency of attention mechanisms,\nthere is still a gap in understanding how attention heads function in\nlong-context settings. In this paper, we observe that while certain heads\nconsistently attend to local information only, others swing between attending\nto local and long-context information depending on the query. This raises the\nquestion: can we identify which heads require long-context information to\npredict the next token accurately? We demonstrate that it's possible to predict\nwhich heads are crucial for long-context processing using only local keys. The\ncore idea here is to exploit a simple model for the long-context scores via\nsecond moment approximations. 
These findings unveil simple properties of\nattention in the context of long sequences, and open the door to potentially\nsignificant gains in efficiency.\n","authors":["Konstantin Donhauser","Charles Arnal","Mohammad Pezeshki","Vivien Cabannes","David Lopez-Paz","Kartik Ahuja"],"pdf_url":"https://arxiv.org/pdf/2502.09647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.10650v2","updated":"2025-03-05T16:11:42Z","published":"2025-02-15T03:03:09Z","title":"Generative Adversarial Networks for High-Dimensional Item Factor\n Analysis: A Deep Adversarial Learning Algorithm","summary":" Advances in deep learning and representation learning have transformed item\nfactor analysis (IFA) in the item response theory (IRT) literature by enabling\nmore efficient and accurate parameter estimation. Variational Autoencoders\n(VAEs) have been one of the most impactful techniques in modeling\nhigh-dimensional latent variables in this context. However, the limited\nexpressiveness of the inference model based on traditional VAEs can still\nhinder the estimation performance. We introduce Adversarial Variational Bayes\n(AVB) algorithms as an improvement to VAEs for IFA with improved flexibility\nand accuracy. By bridging the strengths of VAEs and Generative Adversarial\nNetworks (GANs), AVB incorporates an auxiliary discriminator network to reframe\nthe estimation process as a two-player adversarial game and removes the\nrestrictive assumption of standard normal distributions in the inference model.\nTheoretically, AVB can achieve similar or higher likelihood compared to VAEs. A\nfurther enhanced algorithm, Importance-weighted Adversarial Variational Bayes\n(IWAVB) is proposed and compared with Importance-weighted Autoencoders (IWAE).\nIn an exploratory analysis of empirical data, IWAVB demonstrated superior\nexpressiveness by achieving a higher likelihood compared to IWAE. 
In\nconfirmatory analysis with simulated data, IWAVB achieved similar mean-square\nerror results to IWAE while consistently achieving higher likelihoods. When\nlatent variables followed a multimodal distribution, IWAVB outperformed IWAE.\nWith its innovative use of GANs, IWAVB is shown to have the potential to extend\nIFA to handle large-scale data, facilitating the potential integration of\npsychometrics and multimodal data analysis.\n","authors":["Nanyu Luo","Feng Ji"],"pdf_url":"https://arxiv.org/pdf/2502.10650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12126v2","updated":"2025-03-05T16:08:49Z","published":"2024-11-18T23:34:07Z","title":"MMBind: Unleashing the Potential of Distributed and Heterogeneous Data\n for Multimodal Learning in IoT","summary":" Multimodal sensing systems are increasingly prevalent in various real-world\napplications. Most existing multimodal learning approaches heavily rely on\ntraining with a large amount of synchronized, complete multimodal data.\nHowever, such a setting is impractical in real-world IoT sensing applications\nwhere data is typically collected by distributed nodes with heterogeneous data\nmodalities, and is also rarely labeled. In this paper, we propose MMBind, a new\ndata binding approach for multimodal learning on distributed and heterogeneous\nIoT data. The key idea of MMBind is to construct a pseudo-paired multimodal\ndataset for model training by binding data from disparate sources and\nincomplete modalities through a sufficiently descriptive shared modality. 
We\nalso propose a weighted contrastive learning approach to handle domain shifts\namong disparate data, coupled with an adaptive multimodal learning architecture\ncapable of training models with heterogeneous modality combinations.\nEvaluations on ten real-world multimodal datasets highlight that MMBind\noutperforms state-of-the-art baselines under varying degrees of data\nincompleteness and domain shift, and holds promise for advancing multimodal\nfoundation model training in IoT applications\\footnote (The source code is\navailable via https://github.com/nesl/multimodal-bind).\n","authors":["Xiaomin Ouyang","Jason Wu","Tomoyoshi Kimura","Yihan Lin","Gunjan Verma","Tarek Abdelzaher","Mani Srivastava"],"pdf_url":"https://arxiv.org/pdf/2411.12126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13921v2","updated":"2025-03-05T16:07:23Z","published":"2025-02-19T17:53:59Z","title":"Exploring Code Language Models for Automated HLS-based Hardware\n Generation: Benchmark, Infrastructure and Analysis","summary":" Recent advances in code generation have illuminated the potential of\nemploying large language models (LLMs) for general-purpose programming\nlanguages such as Python and C++, opening new opportunities for automating\nsoftware development and enhancing programmer productivity. The potential of\nLLMs in software programming has sparked significant interest in exploring\nautomated hardware generation and automation. Although preliminary endeavors\nhave been made to adopt LLMs in generating hardware description languages\n(HDLs), several challenges persist in this direction. First, the volume of\navailable HDL training data is substantially smaller compared to that for\nsoftware programming languages. 
Second, the pre-trained LLMs, mainly tailored\nfor software code, tend to produce HDL designs that are more error-prone.\nThird, the generation of HDL requires a significantly higher number of tokens\ncompared to software programming, leading to inefficiencies in cost and energy\nconsumption. To tackle these challenges, this paper explores leveraging LLMs to\ngenerate High-Level Synthesis (HLS)-based hardware design. Although code\ngeneration for domain-specific programming languages is not new in the\nliterature, we aim to provide experimental results, insights, benchmarks, and\nevaluation infrastructure to investigate the suitability of HLS over low-level\nHDLs for LLM-assisted hardware design generation. To achieve this, we first\nfinetune pre-trained models for HLS-based hardware generation, using a\ncollected dataset with text prompts and corresponding reference HLS designs. An\nLLM-assisted framework is then proposed to automate end-to-end hardware code\ngeneration, which also investigates the impact of chain-of-thought and feedback\nloops promoting techniques on HLS-design generation. Limited by the timeframe\nof this research, we plan to evaluate more advanced reasoning models in the\nfuture.\n","authors":["Jiahao Gai","Hao Mark Chen","Zhican Wang","Hongyu Zhou","Wanru Zhao","Nicholas Lane","Hongxiang Fan"],"pdf_url":"https://arxiv.org/pdf/2502.13921v2.pdf","comment":"Paper accepted by ASP-DAC'25"},{"id":"http://arxiv.org/abs/2409.06615v5","updated":"2025-03-05T16:07:20Z","published":"2024-09-10T16:11:57Z","title":"One-Shot Imitation under Mismatched Execution","summary":" Human demonstrations as prompts are a powerful way to program robots to do\nlong-horizon manipulation tasks. However, translating these demonstrations into\nrobot-executable actions presents significant challenges due to execution\nmismatches in movement styles and physical capabilities. 
Existing methods\neither depend on human-robot paired data, which is infeasible to scale, or rely\nheavily on frame-level visual similarities that often break down in practice.\nTo address these challenges, we propose RHyME, a novel framework that\nautomatically aligns human and robot task executions using optimal transport\ncosts. Given long-horizon robot demonstrations, RHyME synthesizes semantically\nequivalent human videos by retrieving and composing short-horizon human clips.\nThis approach facilitates effective policy training without the need for paired\ndata. RHyME successfully imitates a range of cross-embodiment demonstrators,\nboth in simulation and with a real human hand, achieving over 50\\% increase in\ntask success compared to previous methods. We release our code and datasets at\nhttps://portal-cornell.github.io/rhyme/.\n","authors":["Kushal Kedia","Prithwish Dan","Angela Chao","Maximus Adrian Pace","Sanjiban Choudhury"],"pdf_url":"https://arxiv.org/pdf/2409.06615v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03625v1","updated":"2025-03-05T16:05:26Z","published":"2025-03-05T16:05:26Z","title":"Deterministic Global Optimization of the Acquisition Function in\n Bayesian Optimization: To Do or Not To Do?","summary":" Bayesian Optimization (BO) with Gaussian Processes relies on optimizing an\nacquisition function to determine sampling. We investigate the advantages and\ndisadvantages of using a deterministic global solver (MAiNGO) compared to\nconventional local and stochastic global solvers (L-BFGS-B and multi-start,\nrespectively) for the optimization of the acquisition function. For CPU\nefficiency, we set a time limit for MAiNGO, taking the best point as optimal.\nWe perform repeated numerical experiments, initially using the Muller-Brown\npotential as a benchmark function, utilizing the lower confidence bound\nacquisition function; we further validate our findings with three alternative\nbenchmark functions. 
Statistical analysis reveals that when the acquisition\nfunction is more exploitative (as opposed to exploratory), BO with MAiNGO\nconverges in fewer iterations than with the local solvers. However, when the\ndataset lacks diversity, or when the acquisition function is overly\nexploitative, BO with MAiNGO, compared to the local solvers, is more likely to\nconverge to a local rather than a global ly near-optimal solution of the\nblack-box function. L-BFGS-B and multi-start mitigate this risk in BO by\nintroducing stochasticity in the selection of the next sampling point, which\nenhances the exploration of uncharted regions in the search space and reduces\ndependence on acquisition function hyperparameters. Ultimately, suboptimal\noptimization of poorly chosen acquisition functions may be preferable to their\noptimal solution. When the acquisition function is more exploratory, BO with\nMAiNGO, multi-start, and L-BFGS-B achieve comparable probabilities of\nconvergence to a globally near-optimal solution (although BO with MAiNGO may\nrequire more iterations to converge under these conditions).\n","authors":["Anastasia Georgiou","Daniel Jungen","Luise Kaven","Verena Hunstig","Constantine Frangakis","Ioannis Kevrekidis","Alexander Mitsos"],"pdf_url":"https://arxiv.org/pdf/2503.03625v1.pdf","comment":"32 pages, 7 figures, 7 tables"},{"id":"http://arxiv.org/abs/2503.03622v1","updated":"2025-03-05T16:02:09Z","published":"2025-03-05T16:02:09Z","title":"It's My Data Too: Private ML for Datasets with Multi-User Training\n Examples","summary":" We initiate a study of algorithms for model training with user-level\ndifferential privacy (DP), where each example may be attributed to multiple\nusers, which we call the multi-attribution model. We first provide a carefully\nchosen definition of user-level DP under the multi-attribution model. Training\nin the multi-attribution model is facilitated by solving the contribution\nbounding problem, i.e. 
the problem of selecting a subset of the dataset for\nwhich each user is associated with a limited number of examples. We propose a\ngreedy baseline algorithm for the contribution bounding problem. We then\nempirically study this algorithm for a synthetic logistic regression task and a\ntransformer training task, including studying variants of this baseline\nalgorithm that optimize the subset chosen using different techniques and\ncriteria. We find that the baseline algorithm remains competitive with its\nvariants in most settings, and build a better understanding of the practical\nimportance of a bias-variance tradeoff inherent in solutions to the\ncontribution bounding problem.\n","authors":["Arun Ganesh","Ryan McKenna","Brendan McMahan","Adam Smith","Fan Wu"],"pdf_url":"https://arxiv.org/pdf/2503.03622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06712v4","updated":"2025-03-05T15:40:42Z","published":"2024-07-09T09:39:45Z","title":"MDP Geometry, Normalization and Reward Balancing Solvers","summary":" We present a new geometric interpretation of Markov Decision Processes (MDPs)\nwith a natural normalization procedure that allows us to adjust the value\nfunction at each state without altering the advantage of any action with\nrespect to any policy. This advantage-preserving transformation of the MDP\nmotivates a class of algorithms which we call Reward Balancing, which solve\nMDPs by iterating through these transformations, until an approximately optimal\npolicy can be trivially found. We provide a convergence analysis of several\nalgorithms in this class, in particular showing that for MDPs for unknown\ntransition probabilities we can improve upon state-of-the-art sample complexity\nresults.\n","authors":["Arsenii Mustafin","Aleksei Pakharev","Alex Olshevsky","Ioannis Ch. 
Paschalidis"],"pdf_url":"https://arxiv.org/pdf/2407.06712v4.pdf","comment":"AISTATS 2025 camera-ready version"},{"id":"http://arxiv.org/abs/2501.06058v3","updated":"2025-03-05T15:37:52Z","published":"2025-01-10T15:39:39Z","title":"Capability-Aware Shared Hypernetworks for Flexible Heterogeneous\n Multi-Robot Coordination","summary":" Recent advances have enabled heterogeneous multi-robot teams to learn complex\nand effective coordination. However, existing architectural designs that\nsupport heterogeneous teams tend to force a trade-off between expressivity and\nefficiency. Some attempt to encode diverse behaviors within a single shared\narchitecture by appending the input with an ID unique to each robot or robot\ntype. These designs improve sample and parameter efficiency but tend to limit\nbehavioral diversity. Others use a separate policy for each robot, enabling\ngreater diversity at the cost of efficiency and generalization. We view these\ntwo designs as ends of a spectrum and explore a middle-ground approach that\nenables efficient learning of diverse behaviors. Inspired by work in transfer\nlearning and meta RL, and building upon prior work in trait-based task\nallocation, we propose Capability-Aware Shared Hypernetworks (CASH), a\ngeneral-purpose soft weight sharing architecture that uses hypernetworks to\nenable a single architecture to dynamically adapt to each robot and the current\ncontext. Intuitively, CASH encodes shared decision making strategies that can\nbe adapted to each robot based on local observations and the robots' individual\nand collective capabilities (e.g., speed and payload). CASH explicitly captures\nthe impact of capabilities on collective behavior, enabling zero-shot\ngeneralization to unseen robots or team compositions. 
We conducted experiments\nacross four heterogeneous coordination tasks and three learning paradigms\n(imitation learning, value-based, and policy-gradient RL) using SOTA\nmulti-robot simulation (JaxMARL) and hardware (Robotarium) platforms. Across\nall conditions, CASH generates appropriately diverse behaviors and outperforms\nbaseline architectures in task performance and sample efficiency during\ntraining and zero-shot generalization while utilizing 60%-80% fewer learnable\nparameters.\n","authors":["Kevin Fu","Shalin Jain","Pierce Howell","Harish Ravichandar"],"pdf_url":"https://arxiv.org/pdf/2501.06058v3.pdf","comment":"16 pages, 8 figures, equal authorship between Kevin Fu and Shalin\n Jain"},{"id":"http://arxiv.org/abs/2409.16720v2","updated":"2025-03-05T15:35:47Z","published":"2024-09-25T08:09:52Z","title":"Dashing for the Golden Snitch: Multi-Drone Time-Optimal Motion Planning\n with Multi-Agent Reinforcement Learning","summary":" Recent innovations in autonomous drones have facilitated time-optimal flight\nin single-drone configurations, and enhanced maneuverability in multi-drone\nsystems by applying optimal control and learning-based methods. However, few\nstudies have achieved time-optimal motion planning for multi-drone systems,\nparticularly during highly agile maneuvers or in dynamic scenarios. This paper\npresents a decentralized policy network using multi-agent reinforcement\nlearning for time-optimal multi-drone flight. To strike a balance between\nflight efficiency and collision avoidance, we introduce a soft collision-free\nmechanism inspired by optimization-based methods. 
By customizing PPO in a\ncentralized training, decentralized execution (CTDE) fashion, we unlock higher\nefficiency and stability in training while ensuring lightweight implementation.\nExtensive simulations show that, despite slight performance trade-offs compared\nto single-drone systems, our multi-drone approach maintains near-time-optimal\nperformance with a low collision rate. Real-world experiments validate our\nmethod, with two quadrotors using the same network as in simulation achieving a\nmaximum speed of 13.65 m/s and a maximum body rate of 13.4 rad/s in a 5.5 m *\n5.5 m * 2.0 m space across various tracks, relying entirely on onboard\ncomputation.\n","authors":["Xian Wang","Jin Zhou","Yuanli Feng","Jiahao Mei","Jiming Chen","Shuo Li"],"pdf_url":"https://arxiv.org/pdf/2409.16720v2.pdf","comment":"v2: 7 pages, 6 figures; terminology corrected, algorithmic and\n equation descriptions revised, references added"},{"id":"http://arxiv.org/abs/2405.15389v3","updated":"2025-03-05T15:35:35Z","published":"2024-05-24T09:41:06Z","title":"Beyond Canonicalization: How Tensorial Messages Improve Equivariant\n Message Passing","summary":" In numerous applications of geometric deep learning, the studied systems\nexhibit spatial symmetries and it is desirable to enforce these. For the\nsymmetry of global rotations and reflections, this means that the model should\nbe equivariant with respect to the transformations that form the group of\n$\\mathrm O(d)$. While many approaches for equivariant message passing require\nspecialized architectures, including non-standard normalization layers or\nnon-linearities, we here present a framework based on local reference frames\n(\"local canonicalization\") which can be integrated with any architecture\nwithout restrictions. We enhance equivariant message passing based on local\ncanonicalization by introducing tensorial messages to communicate geometric\ninformation consistently between different local coordinate frames. 
Our\nframework applies to message passing on geometric data in Euclidean spaces of\narbitrary dimension. We explicitly show how our approach can be adapted to make\na popular existing point cloud architecture equivariant. We demonstrate the\nsuperiority of tensorial messages and achieve state-of-the-art results on\nnormal vector regression and competitive results on other standard 3D point\ncloud tasks.\n","authors":["Peter Lippmann","Gerrit Gerhartz","Roman Remme","Fred A. Hamprecht"],"pdf_url":"https://arxiv.org/pdf/2405.15389v3.pdf","comment":"To be published in proceedings of ICLR 2025"},{"id":"http://arxiv.org/abs/2412.00980v2","updated":"2025-03-05T15:32:01Z","published":"2024-12-01T22:04:12Z","title":"Incentivizing Truthful Collaboration in Heterogeneous Federated Learning","summary":" Federated learning (FL) is a distributed collaborative learning method, where\nmultiple clients learn together by sharing gradient updates instead of raw\ndata. However, it is well-known that FL is vulnerable to manipulated updates\nfrom clients. In this work we study the impact of data heterogeneity on\nclients' incentives to manipulate their updates. First, we present\nheterogeneous collaborative learning scenarios where a client can modify their\nupdates to be better off, and show that these manipulations can lead to\ndiminishing model performance. To prevent such modifications, we formulate a\ngame in which clients may misreport their gradient updates in order to \"steer\"\nthe server model to their advantage. We develop a payment rule that provably\ndisincentivizes sending modified updates under the FedSGD protocol. We derive\nexplicit bounds on the clients' payments and the convergence rate of the global\nmodel, which allows us to study the trade-off between heterogeneity, payments\nand convergence. 
Finally, we provide an experimental evaluation of the\neffectiveness of our payment rule in the FedSGD, median-based aggregation\nFedSGD and FedAvg protocols on three tasks in computer vision and natural\nlanguage processing. In all cases we find that our scheme successfully\ndisincentivizes modifications.\n","authors":["Dimitar Chakarov","Nikita Tsoy","Kristian Minchev","Nikola Konstantinov"],"pdf_url":"https://arxiv.org/pdf/2412.00980v2.pdf","comment":"29 pages, 8 figures"},{"id":"http://arxiv.org/abs/2503.03595v1","updated":"2025-03-05T15:28:50Z","published":"2025-03-05T15:28:50Z","title":"Towards Understanding Text Hallucination of Diffusion Models via Local\n Generation Bias","summary":" Score-based diffusion models have achieved incredible performance in\ngenerating realistic images, audio, and video data. While these models produce\nhigh-quality samples with impressive details, they often introduce unrealistic\nartifacts, such as distorted fingers or hallucinated texts with no meaning.\nThis paper focuses on textual hallucinations, where diffusion models correctly\ngenerate individual symbols but assemble them in a nonsensical manner. Through\nexperimental probing, we consistently observe that such phenomenon is\nattributed it to the network's local generation bias. Denoising networks tend\nto produce outputs that rely heavily on highly correlated local regions,\nparticularly when different dimensions of the data distribution are nearly\npairwise independent. This behavior leads to a generation process that\ndecomposes the global distribution into separate, independent distributions for\neach symbol, ultimately failing to capture the global structure, including\nunderlying grammar. Intriguingly, this bias persists across various denoising\nnetwork architectures including MLP and transformers which have the structure\nto model global dependency. 
These findings also provide insights into\nunderstanding other types of hallucinations, extending beyond text, as a result\nof implicit biases in the denoising models. Additionally, we theoretically\nanalyze the training dynamics for a specific case involving a two-layer MLP\nlearning parity points on a hypercube, offering an explanation of its\nunderlying mechanism.\n","authors":["Rui Lu","Runzhe Wang","Kaifeng Lyu","Xitai Jiang","Gao Huang","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01999v2","updated":"2025-03-05T15:26:17Z","published":"2025-01-01T07:00:41Z","title":"On the Utility of Equivariance and Symmetry Breaking in Deep Learning\n Architectures on Point Clouds","summary":" This paper explores the key factors that influence the performance of models\nworking with point clouds, across different tasks of varying geometric\ncomplexity. In this work, we explore the trade-offs between flexibility and\nweight-sharing introduced by equivariant layers, assessing when equivariance\nboosts or detracts from performance. It is often argued that providing more\ninformation as input improves a model's performance. However, if this\nadditional information breaks certain properties, such as $\\SE(3)$\nequivariance, does it remain beneficial? We identify the key aspects of\nequivariant and non-equivariant architectures that drive success in different\ntasks by benchmarking them on segmentation, regression, and generation tasks\nacross multiple datasets with increasing complexity. 
We observe a positive\nimpact of equivariance, which becomes more pronounced with increasing task\ncomplexity, even when strict equivariance is not required.\n","authors":["Sharvaree Vadgama","Mohammad Mohaiminul Islam","Domas Buracus","Christian Shewmake","Erik Bekkers"],"pdf_url":"https://arxiv.org/pdf/2501.01999v2.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2503.03588v1","updated":"2025-03-05T15:24:11Z","published":"2025-03-05T15:24:11Z","title":"PowerAttention: Exponentially Scaling of Receptive Fields for Effective\n Sparse Attention","summary":" Large Language Models (LLMs) face efficiency bottlenecks due to the quadratic\ncomplexity of the attention mechanism when processing long contexts. Sparse\nattention methods offer a promising solution, but existing approaches often\nsuffer from incomplete effective context and/or require complex implementation\nof pipeline. We present a comprehensive analysis of sparse attention for\nautoregressive LLMs from the respective of receptive field, recognize the\nsuboptimal nature of existing methods for expanding the receptive field, and\nintroduce PowerAttention, a novel sparse attention design that facilitates\neffective and complete context extension through the theoretical analysis.\nPowerAttention achieves exponential receptive field growth in $d$-layer LLMs,\nallowing each output token to attend to $2^d$ tokens, ensuring completeness and\ncontinuity of the receptive field. Experiments demonstrate that PowerAttention\noutperforms existing static sparse attention methods by $5\\sim 40\\%$,\nespecially on tasks demanding long-range dependencies like Passkey Retrieval\nand RULER, while maintaining a comparable time complexity to sliding window\nattention. 
Efficiency evaluations further highlight PowerAttention's superior\nspeedup in both prefilling and decoding phases compared with dynamic sparse\nattentions and full attention ($3.0\\times$ faster on 128K context), making it a\nhighly effective and user-friendly solution for processing long sequences in\nLLMs.\n","authors":["Lida Chen","Dong Xu","Chenxin An","Xintao Wang","Yikai Zhang","Jiangjie Chen","Zujie Liang","Feng Wei","Jiaqing Liang","Yanghua Xiao","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03588v1.pdf","comment":"for associated code, see https://github.com/w568w/PowerAttention"},{"id":"http://arxiv.org/abs/2503.03579v1","updated":"2025-03-05T15:13:54Z","published":"2025-03-05T15:13:54Z","title":"A Generative System for Robot-to-Human Handovers: from Intent Inference\n to Spatial Configuration Imagery","summary":" We propose a novel system for robot-to-human object handover that emulates\nhuman coworker interactions. Unlike most existing studies that focus primarily\non grasping strategies and motion planning, our system focus on 1. inferring\nhuman handover intents, 2. imagining spatial handover configuration. The first\none integrates multimodal perception-combining visual and verbal cues-to infer\nhuman intent. The second one using a diffusion-based model to generate the\nhandover configuration, involving the spacial relationship among robot's\ngripper, the object, and the human hand, thereby mimicking the cognitive\nprocess of motor imagery. Experimental results demonstrate that our approach\neffectively interprets human cues and achieves fluent, human-like handovers,\noffering a promising solution for collaborative robotics. 
Code, videos, and\ndata are available at: https://i3handover.github.io.\n","authors":["Hanxin Zhang","Abdulqader Dhafer","Zhou Daniel Hao","Hongbiao Dong"],"pdf_url":"https://arxiv.org/pdf/2503.03579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.12395v2","updated":"2025-03-05T15:10:51Z","published":"2025-02-18T00:06:40Z","title":"Efficient Neural SDE Training using Wiener-Space Cubature","summary":" A neural stochastic differential equation (SDE) is an SDE with drift and\ndiffusion terms parametrized by neural networks. The training procedure for\nneural SDEs consists of optimizing the SDE vector field (neural network)\nparameters to minimize the expected value of an objective functional on\ninfinite-dimensional path-space. Existing training techniques focus on methods\nto efficiently compute path-wise gradients of the objective functional with\nrespect to these parameters, then pair this with Monte-Carlo simulation to\nestimate the expectation, and stochastic gradient descent to optimize. In this\nwork we introduce a novel training technique which bypasses and improves upon\nMonte-Carlo simulation; we extend results in the theory of Wiener-space\ncubature to approximate the expected objective functional by a weighted sum of\ndeterministic ODE solutions. This allows us to compute gradients by efficient\nODE adjoint methods. Furthermore, we exploit a high-order recombination scheme\nto drastically reduce the number of ODE solutions necessary to achieve a\nreasonable approximation. 
We show that this Wiener-space cubature approach can\nsurpass the O(1/sqrt(n)) rate of Monte-Carlo simulation, or the O(log(n)/n)\nrate of quasi-Monte-Carlo, to achieve a O(1/n) rate under reasonable\nassumptions.\n","authors":["Luke Snow","Vikram Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2502.12395v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03576v1","updated":"2025-03-05T15:02:46Z","published":"2025-03-05T15:02:46Z","title":"Optimal Decision Tree Pruning Revisited: Algorithms and Complexity","summary":" We present a comprehensive classical and parameterized complexity analysis of\ndecision tree pruning operations, extending recent research on the complexity\nof learning small decision trees. Thereby, we offer new insights into the\ncomputational challenges of decision tree simplification, a crucial aspect of\ndeveloping interpretable and efficient machine learning models. We focus on\nfundamental pruning operations of subtree replacement and raising, which are\nused in heuristics. Surprisingly, while optimal pruning can be performed in\npolynomial time for subtree replacement, the problem is NP-complete for subtree\nraising. Therefore, we identify parameters and combinations thereof that lead\nto fixed-parameter tractability or hardness, establishing a precise borderline\nbetween these complexity classes. For example, while subtree raising is hard\nfor small domain size $D$ or number $d$ of features, it can be solved in\n$D^{2d} \\cdot |I|^{O(1)}$ time, where $|I|$ is the input size. 
We complement\nour theoretical findings with preliminary experimental results, demonstrating\nthe practical implications of our analysis.\n","authors":["Juha Harviainen","Frank Sommer","Manuel Sorge","Stefan Szeider"],"pdf_url":"https://arxiv.org/pdf/2503.03576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03574v1","updated":"2025-03-05T15:01:56Z","published":"2025-03-05T15:01:56Z","title":"Olympus: A Jumping Quadruped for Planetary Exploration Utilizing\n Reinforcement Learning for In-Flight Attitude Control","summary":" Exploring planetary bodies with lower gravity, such as the moon and Mars,\nallows legged robots to utilize jumping as an efficient form of locomotion thus\ngiving them a valuable advantage over traditional rovers for exploration.\nMotivated by this fact, this paper presents the design, simulation, and\nlearning-based \"in-flight\" attitude control of Olympus, a jumping legged robot\ntailored to the gravity of Mars. First, the design requirements are outlined\nfollowed by detailing how simulation enabled optimizing the robot's design -\nfrom its legs to the overall configuration - towards high vertical jumping,\nforward jumping distance, and in-flight attitude reorientation. Subsequently,\nthe reinforcement learning policy used to track desired in-flight attitude\nmaneuvers is presented. 
Successfully crossing the sim2real gap, extensive\nexperimental studies of attitude reorientation tests are demonstrated.\n","authors":["Jørgen Anker Olsen","Grzegorz Malczyk","Kostas Alexis"],"pdf_url":"https://arxiv.org/pdf/2503.03574v1.pdf","comment":"7 pages, 6 figures, Accepted to the IEEE International Conference on\n Robotics and Automation (ICRA) 2025"},{"id":"http://arxiv.org/abs/2503.03571v1","updated":"2025-03-05T15:00:39Z","published":"2025-03-05T15:00:39Z","title":"Domain Consistent Industrial Decarbonisation of Global Coal Power Plants","summary":" Machine learning and optimisation techniques (MLOPT) hold significant\npotential to accelerate the decarbonisation of industrial systems by enabling\ndata-driven operational improvements. However, the practical application of\nMLOPT in industrial settings is often hindered by a lack of domain compliance\nand system-specific consistency, resulting in suboptimal solutions with limited\nreal-world applicability. To address this challenge, we propose a novel\nhuman-in-the-loop (HITL) constraint-based optimisation framework that\nintegrates domain expertise with data-driven methods, ensuring solutions are\nboth technically sound and operationally feasible. We demonstrate the efficacy\nof this framework through a case study focused on enhancing the thermal\nefficiency and reducing the turbine heat rate of a 660 MW supercritical\ncoal-fired power plant. By embedding domain knowledge as constraints within the\noptimisation process, our approach yields solutions that align with the plant's\noperational patterns and are seamlessly integrated into its control systems.\nEmpirical validation confirms a mean improvement in thermal efficiency of\n0.64\\% and a mean reduction in turbine heat rate of 93 kJ/kWh. Scaling our\nanalysis to 59 global coal power plants with comparable capacity and fuel type,\nwe estimate a cumulative lifetime reduction of 156.4 million tons of carbon\nemissions. 
These results underscore the transformative potential of our\nHITL-MLOPT framework in delivering domain-compliant, implementable solutions\nfor industrial decarbonisation, offering a scalable pathway to mitigate the\nenvironmental impact of coal-based power generation worldwide.\n","authors":["Waqar Muhammad Ashraf","Vivek Dua","Ramit Debnath"],"pdf_url":"https://arxiv.org/pdf/2503.03571v1.pdf","comment":"6 figures. 17 pages"},{"id":"http://arxiv.org/abs/2503.03565v1","updated":"2025-03-05T14:53:32Z","published":"2025-03-05T14:53:32Z","title":"Probabilistic Insights for Efficient Exploration Strategies in\n Reinforcement Learning","summary":" We investigate efficient exploration strategies of environments with unknown\nstochastic dynamics and sparse rewards. Specifically, we analyze first the\nimpact of parallel simulations on the probability of reaching rare states\nwithin a finite time budget. Using simplified models based on random walks and\nL\\'evy processes, we provide analytical results that demonstrate a phase\ntransition in reaching probabilities as a function of the number of parallel\nsimulations. We identify an optimal number of parallel simulations that\nbalances exploration diversity and time allocation. Additionally, we analyze a\nrestarting mechanism that exponentially enhances the probability of success by\nredirecting efforts toward more promising regions of the state space. 
Our\nfindings contribute to a more qualitative and quantitative theory of some\nexploration schemes in reinforcement learning, offering insights into\ndeveloping more efficient strategies for environments characterized by rare\nevents.\n","authors":["Ernesto Garcia","Paola Bermolen","Matthieu Jonckheere","Seva Shneer"],"pdf_url":"https://arxiv.org/pdf/2503.03565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03561v1","updated":"2025-03-05T14:49:06Z","published":"2025-03-05T14:49:06Z","title":"Transformer-Based Power Optimization for Max-Min Fairness in Cell-Free\n Massive MIMO","summary":" Power allocation is an important task in wireless communication networks.\nClassical optimization algorithms and deep learning methods, while effective in\nsmall and static scenarios, become either computationally demanding or\nunsuitable for large and dynamic networks with varying user loads. This letter\nexplores the potential of transformer-based deep learning models to address\nthese challenges. We propose a transformer neural network to jointly predict\noptimal uplink and downlink power using only user and access point positions.\nThe max-min fairness problem in cell-free massive multiple input multiple\noutput systems is considered. Numerical results show that the trained model\nprovides near-optimal performance and adapts to varying numbers of users and\naccess points without retraining, additional processing, or updating its neural\nnetwork architecture. 
This demonstrates the effectiveness of the proposed model\nin achieving robust and flexible power allocation for dynamic networks.\n","authors":["Irched Chafaa","Giacomo Bacci","Luca Sanguinetti"],"pdf_url":"https://arxiv.org/pdf/2503.03561v1.pdf","comment":"5 pages, IEEE WCL, 4 FIGURES"},{"id":"http://arxiv.org/abs/2407.16205v5","updated":"2025-03-05T14:43:33Z","published":"2024-07-23T06:14:41Z","title":"LLMs can be Dangerous Reasoners: Analyzing-based Jailbreak Attack on\n Large Language Models","summary":" The rapid development of Large Language Models (LLMs) has brought significant\nadvancements across various tasks. However, despite these achievements, LLMs\nstill exhibit inherent safety vulnerabilities, especially when confronted with\njailbreak attacks. Existing jailbreak methods suffer from two main limitations:\nreliance on complicated prompt engineering and iterative optimization, which\nlead to low attack success rate (ASR) and attack efficiency (AE). In this work,\nwe propose an efficient jailbreak attack method, Analyzing-based Jailbreak\n(ABJ), which leverages the advanced reasoning capability of LLMs to\nautonomously generate harmful content, revealing their underlying safety\nvulnerabilities during complex reasoning process. We conduct comprehensive\nexperiments on ABJ across various open-source and closed-source LLMs. In\nparticular, ABJ achieves high ASR (82.1% on GPT-4o-2024-11-20) with exceptional\nAE among all target LLMs, showcasing its remarkable attack effectiveness,\ntransferability, and efficiency. 
Our findings underscore the urgent need to\nprioritize and improve the safety of LLMs to mitigate the risks of misuse.\n","authors":["Shi Lin","Hongming Yang","Dingyang Lin","Rongchang Li","Xun Wang","Changting Lin","Wenpeng Xing","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2407.16205v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.07115v3","updated":"2025-03-05T14:43:01Z","published":"2025-02-10T23:11:44Z","title":"Online Scheduling for LLM Inference with KV Cache Constraints","summary":" Large Language Model (LLM) inference, where a trained model generates text\none word at a time in response to user prompts, is a computationally intensive\nprocess requiring efficient scheduling to optimize latency and resource\nutilization. A key challenge in LLM inference is the management of the\nKey-Value (KV) cache, which reduces redundant computations but introduces\nmemory constraints. In this work, we model LLM inference with KV cache\nconstraints theoretically and propose novel batching and scheduling algorithms\nthat minimize inference latency while effectively managing the KV cache's\nmemory.\n We analyze both semi-online and fully online scheduling models, and our\nresults are threefold. First, we provide a polynomial-time algorithm that\nachieves exact optimality in terms of average latency in the semi-online prompt\narrival model. Second, in the fully online case with a stochastic prompt\narrival, we introduce an efficient online scheduling algorithm with constant\nregret. Third, we prove that no algorithm (deterministic or randomized) can\nachieve a constant competitive ratio in fully online adversarial settings. Our\nempirical evaluations on a public LLM inference dataset, using the Llama-70B\nmodel on A100 GPUs, show that our approach significantly outperforms benchmark\nalgorithms used currently in practice, achieving lower latency while reducing\nenergy consumption. 
Overall, our results offer a path toward more sustainable\nand cost-effective LLM deployment.\n","authors":["Patrick Jaillet","Jiashuo Jiang","Chara Podimata","Zijie Zhou"],"pdf_url":"https://arxiv.org/pdf/2502.07115v3.pdf","comment":"Will add a lemma in the proof of Theorem 5.3 to make the statement\n and proof more rigorous"},{"id":"http://arxiv.org/abs/2503.03548v1","updated":"2025-03-05T14:32:32Z","published":"2025-03-05T14:32:32Z","title":"Simulation-Based Performance Evaluation of 3D Object Detection Methods\n with Deep Learning for a LiDAR Point Cloud Dataset in a SOTIF-related Use\n Case","summary":" Safety of the Intended Functionality (SOTIF) addresses sensor performance\nlimitations and deep learning-based object detection insufficiencies to ensure\nthe intended functionality of Automated Driving Systems (ADS). This paper\npresents a methodology examining the adaptability and performance evaluation of\nthe 3D object detection methods on a LiDAR point cloud dataset generated by\nsimulating a SOTIF-related Use Case. The major contributions of this paper\ninclude defining and modelling a SOTIF-related Use Case with 21 diverse weather\nconditions and generating a LiDAR point cloud dataset suitable for application\nof 3D object detection methods. The dataset consists of 547 frames,\nencompassing clear, cloudy, rainy weather conditions, corresponding to\ndifferent times of the day, including noon, sunset, and night. 
Employing\nMMDetection3D and OpenPCDET toolkits, the performance of State-of-the-Art\n(SOTA) 3D object detection methods is evaluated and compared by testing the\npre-trained Deep Learning (DL) models on the generated dataset using Average\nPrecision (AP) and Recall metrics.\n","authors":["Milin Patel","Rolf Jung"],"pdf_url":"https://arxiv.org/pdf/2503.03548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03545v1","updated":"2025-03-05T14:28:38Z","published":"2025-03-05T14:28:38Z","title":"Revisiting the Role of Relearning in Semantic Dementia","summary":" Patients with semantic dementia (SD) present with remarkably consistent\natrophy of neurons in the anterior temporal lobe and behavioural impairments,\nsuch as graded loss of category knowledge. While relearning of lost knowledge\nhas been shown in acute brain injuries such as stroke, it has not been widely\nsupported in chronic cognitive diseases such as SD. Previous research has shown\nthat deep linear artificial neural networks exhibit stages of semantic learning\nakin to humans. Here, we use a deep linear network to test the hypothesis that\nrelearning during disease progression rather than particular atrophy cause the\nspecific behavioural patterns associated with SD. After training the network to\ngenerate the common semantic features of various hierarchically organised\nobjects, neurons are successively deleted to mimic atrophy while retraining the\nmodel. The model with relearning and deleted neurons reproduced errors specific\nto SD, including prototyping errors and cross-category confusions. This\nsuggests that relearning is necessary for artificial neural networks to\nreproduce the behavioural patterns associated with SD in the absence of\n\\textit{output} non-linearities. Our results support a theory of SD progression\nthat results from continuous relearning of lost information. 
Future research\nshould revisit the role of relearning as a contributing factor to cognitive\ndiseases.\n","authors":["Devon Jarvis","Verena Klar","Richard Klein","Benjamin Rosman","Andrew Saxe"],"pdf_url":"https://arxiv.org/pdf/2503.03545v1.pdf","comment":"3 pages, 2 figures, presented at the Cognitive Computational\n Neuroscience Conference (CCN) 2023"},{"id":"http://arxiv.org/abs/2409.16502v2","updated":"2025-03-05T14:11:44Z","published":"2024-09-24T23:18:32Z","title":"GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for\n Improved Visual Localization","summary":" Although various visual localization approaches exist, such as scene\ncoordinate regression and camera pose regression, these methods often struggle\nwith optimization complexity or limited accuracy. To address these challenges,\nwe explore the use of novel view synthesis techniques, particularly 3D Gaussian\nSplatting (3DGS), which enables the compact encoding of both 3D geometry and\nscene appearance. We propose a two-stage procedure that integrates dense and\nrobust keypoint descriptors from the lightweight XFeat feature extractor into\n3DGS, enhancing performance in both indoor and outdoor environments. The coarse\npose estimates are directly obtained via 2D-3D correspondences between the 3DGS\nrepresentation and query image descriptors. In the second stage, the initial\npose estimate is refined by minimizing the rendering-based photometric warp\nloss. 
Benchmarking on widely used indoor and outdoor datasets demonstrates\nimprovements over recent neural rendering-based localization methods, such as\nNeRFMatch and PNeRFLoc.\n","authors":["Gennady Sidorov","Malik Mohrat","Denis Gridusov","Ruslan Rakhimov","Sergey Kolyubin"],"pdf_url":"https://arxiv.org/pdf/2409.16502v2.pdf","comment":"Project website at https://gsplatloc.github.io/"},{"id":"http://arxiv.org/abs/2503.03524v1","updated":"2025-03-05T14:08:53Z","published":"2025-03-05T14:08:53Z","title":"Intrinsic and Extrinsic Factor Disentanglement for Recommendation in\n Various Context Scenarios","summary":" In recommender systems, the patterns of user behaviors (e.g., purchase,\nclick) may vary greatly in different contexts (e.g., time and location). This\nis because user behavior is jointly determined by two types of factors:\nintrinsic factors, which reflect consistent user preference, and extrinsic\nfactors, which reflect external incentives that may vary in different contexts.\nDifferentiating between intrinsic and extrinsic factors helps learn user\nbehaviors better. However, existing studies have only considered\ndifferentiating them from a single, pre-defined context (e.g., time or\nlocation), ignoring the fact that a user's extrinsic factors may be influenced\nby the interplay of various contexts at the same time. In this paper, we\npropose the Intrinsic-Extrinsic Disentangled Recommendation (IEDR) model, a\ngeneric framework that differentiates intrinsic from extrinsic factors\nconsidering various contexts simultaneously, enabling more accurate\ndifferentiation of factors and hence the improvement of recommendation\naccuracy. IEDR contains a context-invariant contrastive learning component to\ncapture intrinsic factors, and a disentanglement component to extract extrinsic\nfactors under the interplay of various contexts. The two components work\ntogether to achieve effective factor learning. 
Extensive experiments on\nreal-world datasets demonstrate IEDR's effectiveness in learning disentangled\nfactors and significantly improving recommendation accuracy by up to 4% in\nNDCG.\n","authors":["Yixin Su","Wei Jiang","Fangquan Lin","Cheng Yang","Sarah M. Erfani","Junhao Gan","Yunxiang Zhao","Ruixuan Li","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03524v1.pdf","comment":"32 pages, 13 figures, 11 tables. Accepted by Transactions of\n Information Systems"},{"id":"http://arxiv.org/abs/2503.03523v1","updated":"2025-03-05T14:07:29Z","published":"2025-03-05T14:07:29Z","title":"O-RAN xApps Conflict Management using Graph Convolutional Networks","summary":" Open Radio Access Network (O-RAN) adopts a flexible, open, and virtualized\nstructure with standardized interfaces, reducing dependency on a single\nsupplier. Conflict management in O-RAN refers to the process of identifying and\nresolving conflicts between network applications. xApps are applications\ndeployed at the RAN Intelligent Controller (RIC) that leverage advanced AI/ML\nalgorithms to make dynamic decisions for network optimization. The lack of a\nunified mechanism to coordinate and prioritize the actions of different\napplications can create three types of conflicts (direct, indirect, and\nimplicit). In our paper, we introduce a novel data-driven GCN-based method\ncalled Graph-based xApps Conflict and Root Cause Analysis Engine (GRACE) based\non Graph Convolutional Network (GCN). It detects three types of conflicts\n(direct, indirect, and implicit) and pinpoints the root causes (xApps). GRACE\ncaptures the complex and hidden dependencies among the xApps, the controlled\nparameters, and the KPIs in O-RAN to detect possible conflicts. Then, it\nidentifies the root causes (xApps) contributing to the detected conflicts. The\nproposed method was tested on highly imbalanced datasets where the number of\nconflict instances ranges from 40% to 10%. 
The model is tested in a setting\nthat simulates real-world scenarios where conflicts are rare to assess its\nperformance and generalizability. Experimental results demonstrate an\nexceptional performance, achieving a high F1-score greater than 98% for all the\ncase studies.\n","authors":["Maryam Al Shami","Jun Yan","Emmanuel Thepie Fapi"],"pdf_url":"https://arxiv.org/pdf/2503.03523v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.18180v3","updated":"2025-03-05T14:05:29Z","published":"2024-12-24T05:34:05Z","title":"PCM Selector: Penalized Covariate-Mediator Selection Operator for\n Evaluating Linear Causal Effects","summary":" For a data-generating process for random variables that can be described with\na linear structural equation model, we consider a situation in which (i) a set\nof covariates satisfying the back-door criterion cannot be observed or (ii)\nsuch a set can be observed, but standard statistical estimation methods cannot\nbe applied to estimate causal effects because of\nmulticollinearity/high-dimensional data problems. We propose a novel two-stage\npenalized regression approach, the penalized covariate-mediator selection\noperator (PCM Selector), to estimate the causal effects in such scenarios.\nUnlike existing penalized regression analyses, when a set of intermediate\nvariables is available, PCM Selector provides a consistent or less biased\nestimator of the causal effect. In addition, PCM Selector provides a variable\nselection procedure for intermediate variables to obtain better estimation\naccuracy of the causal effects than does the back-door criterion.\n","authors":["Hisayoshi Nanmo","Manabu Kuroki"],"pdf_url":"https://arxiv.org/pdf/2412.18180v3.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2503.01431v2","updated":"2025-03-05T14:04:46Z","published":"2025-03-03T11:34:27Z","title":"How simple can you go? 
An off-the-shelf transformer approach to\n molecular dynamics","summary":" Most current neural networks for molecular dynamics (MD) include physical\ninductive biases, resulting in specialized and complex architectures. This is\nin contrast to most other machine learning domains, where specialist approaches\nare increasingly replaced by general-purpose architectures trained on vast\ndatasets. In line with this trend, several recent studies have questioned the\nnecessity of architectural features commonly found in MD models, such as\nbuilt-in rotational equivariance or energy conservation. In this work, we\ncontribute to the ongoing discussion by evaluating the performance of an MD\nmodel with as few specialized architectural features as possible. We present a\nrecipe for MD using an Edge Transformer, an \"off-the-shelf'' transformer\narchitecture that has been minimally modified for the MD domain, termed MD-ET.\nOur model implements neither built-in equivariance nor energy conservation. We\nuse a simple supervised pre-training scheme on $\\sim$30 million molecular\nstructures from the QCML database. Using this \"off-the-shelf'' approach, we\nshow state-of-the-art results on several benchmarks after fine-tuning for a\nsmall number of steps. Additionally, we examine the effects of being only\napproximately equivariant and energy conserving for MD simulations, proposing a\nnovel method for distinguishing the errors resulting from non-equivariance from\nother sources of inaccuracies like numerical rounding errors. While our model\nexhibits runaway energy increases on larger structures, we show approximately\nenergy-conserving NVE simulations for a range of small structures.\n","authors":["Max Eissler","Tim Korjakow","Stefan Ganscha","Oliver T. 
Unke","Klaus-Robert Müller","Stefan Gugler"],"pdf_url":"https://arxiv.org/pdf/2503.01431v2.pdf","comment":"21 pages, code at https://github.com/mx-e/simple-md"},{"id":"http://arxiv.org/abs/2503.03515v1","updated":"2025-03-05T14:01:17Z","published":"2025-03-05T14:01:17Z","title":"DO-IQS: Dynamics-Aware Offline Inverse Q-Learning for Optimal Stopping\n with Unknown Gain Functions","summary":" We consider Inverse Optimal Stopping (IOS) problem where, based on stopped\nexpert trajectories, one aims to recover the optimal stopping region through\ncontinuation and stopping gain functions approximation. The uniqueness of the\nstopping region allows the use of IOS in real-world applications with safety\nconcerns. While current state-of-the-art inverse reinforcement learning methods\nrecover both a Q-function and the corresponding optimal policy, they fail to\naccount for specific challenges posed by optimal stopping problems. These\ninclude data sparsity near the stopping region, non-Markovian nature of the\ncontinuation gain, a proper treatment of boundary conditions, the need for a\nstable offline approach for risk-sensitive applications, and a lack of a\nquality evaluation metric. These challenges are addressed with the proposed\nDynamics-Aware Offline Inverse Q-Learning for Optimal Stopping (DO-IQS), which\nincorporates temporal information by approximating the cumulative continuation\ngain together with the world dynamics and the Q-function without querying to\nthe environment. Moreover, a confidence-based oversampling approach is proposed\nto treat the data sparsity problem. 
We demonstrate the performance of our\nmodels on real and artificial data including an optimal intervention for\ncritical events problem.\n","authors":["Anna Kuchko"],"pdf_url":"https://arxiv.org/pdf/2503.03515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05459v2","updated":"2025-03-05T13:57:56Z","published":"2024-10-07T19:45:09Z","title":"From Sparse Dependence to Sparse Attention: Unveiling How\n Chain-of-Thought Enhances Transformer Sample Efficiency","summary":" Chain-of-thought (CoT) significantly enhances the reasoning performance of\nlarge language models (LLM). While current theoretical studies often attribute\nthis improvement to increased expressiveness and computational capacity, we\nargue that expressiveness is not the primary limitation in the LLM regime, as\ncurrent large models will fail on simple tasks. Using a parity-learning setup,\nwe demonstrate that CoT can substantially improve sample efficiency even when\nthe representation power is sufficient. Specifically, with CoT, a transformer\ncan learn the function within polynomial samples, whereas without CoT, the\nrequired sample size is exponential. Additionally, we show that CoT simplifies\nthe learning process by introducing sparse sequential dependencies among input\ntokens, and leads to a sparse and interpretable attention. 
We validate our\ntheoretical analysis with both synthetic and real-world experiments, confirming\nthat sparsity in attention layers is a key factor of the improvement induced by\nCoT.\n","authors":["Kaiyue Wen","Huaqing Zhang","Hongzhou Lin","Jingzhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.05459v2.pdf","comment":"43 pages,11 figures"},{"id":"http://arxiv.org/abs/2503.03512v1","updated":"2025-03-05T13:57:48Z","published":"2025-03-05T13:57:48Z","title":"An Aspect Extraction Framework using Different Embedding Types, Learning\n Models, and Dependency Structure","summary":" Aspect-based sentiment analysis has gained significant attention in recent\nyears due to its ability to provide fine-grained insights for sentiment\nexpressions related to specific features of entities. An important component of\naspect-based sentiment analysis is aspect extraction, which involves\nidentifying and extracting aspect terms from text. Effective aspect extraction\nserves as the foundation for accurate sentiment analysis at the aspect level.\nIn this paper, we propose aspect extraction models that use different types of\nembeddings for words and part-of-speech tags and that combine several learning\nmodels. We also propose tree positional encoding that is based on dependency\nparsing output to capture better the aspect positions in sentences. In\naddition, a new aspect extraction dataset is built for Turkish by machine\ntranslating an English dataset in a controlled setting. 
The experiments\nconducted on two Turkish datasets showed that the proposed models mostly\noutperform the studies that use the same datasets, and incorporating tree\npositional encoding increases the performance of the models.\n","authors":["Ali Erkan","Tunga Güngör"],"pdf_url":"https://arxiv.org/pdf/2503.03512v1.pdf","comment":"Aspect-based Sentiment Analysis, Aspect Extraction, Natural Language\n Processing, Machine Learning, Deep Neural Networks, Turkish"},{"id":"http://arxiv.org/abs/2503.03506v1","updated":"2025-03-05T13:54:13Z","published":"2025-03-05T13:54:13Z","title":"Rethinking Synthetic Data definitions: A privacy driven approach","summary":" Synthetic data is gaining traction as a cost-effective solution for the\nincreasing data demands of AI development and can be generated either from\nexisting knowledge or derived data captured from real-world events. The source\nof the synthetic data generation and the technique used significantly impacts\nits residual privacy risk and therefore its opportunity for sharing.\nTraditional classification of synthetic data types no longer fit the newer\ngeneration techniques and there is a need to better align the classification\nwith practical needs. We suggest a new way of grouping synthetic data types\nthat better supports privacy evaluations to aid regulatory policymaking. 
Our\nnovel classification provides flexibility to new advancements like deep\ngenerative methods and offers a more practical framework for future\napplications.\n","authors":["Vibeke Binz Vallevik","Serena Elizabeth Marshall","Aleksandar Babic","Jan Franz Nygaard"],"pdf_url":"https://arxiv.org/pdf/2503.03506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03503v1","updated":"2025-03-05T13:47:55Z","published":"2025-03-05T13:47:55Z","title":"Collaborative Expert LLMs Guided Multi-Objective Molecular Optimization","summary":" Molecular optimization is a crucial yet complex and time-intensive process\nthat often acts as a bottleneck for drug development. Traditional methods rely\nheavily on trial and error, making multi-objective optimization both\ntime-consuming and resource-intensive. Current AI-based methods have shown\nlimited success in handling multi-objective optimization tasks, hampering their\npractical utilization. To address this challenge, we present MultiMol, a\ncollaborative large language model (LLM) system designed to guide\nmulti-objective molecular optimization. MultiMol comprises two agents,\nincluding a data-driven worker agent and a literature-guided research agent.\nThe data-driven worker agent is a large language model being fine-tuned to\nlearn how to generate optimized molecules considering multiple objectives,\nwhile the literature-guided research agent is responsible for searching\ntask-related literature to find useful prior knowledge that facilitates\nidentifying the most promising optimized candidates. In evaluations across six\nmulti-objective optimization tasks, MultiMol significantly outperforms existing\nmethods, achieving a 82.30% success rate, in sharp contrast to the 27.50%\nsuccess rate of current strongest methods. To further validate its practical\nimpact, we tested MultiMol on two real-world challenges. 
First, we enhanced the\nselectivity of Xanthine Amine Congener (XAC), a promiscuous ligand that binds\nboth A1R and A2AR, successfully biasing it towards A1R. Second, we improved the\nbioavailability of Saquinavir, an HIV-1 protease inhibitor with known\nbioavailability limitations. Overall, these results indicate that MultiMol\nrepresents a highly promising approach for multi-objective molecular\noptimization, holding great potential to accelerate the drug development\nprocess and contribute to the advancement of pharmaceutical research.\n","authors":["Jiajun Yu","Yizhen Zheng","Huan Yee Koh","Shirui Pan","Tianyue Wang","Haishuai Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03499v1","updated":"2025-03-05T13:44:42Z","published":"2025-03-05T13:44:42Z","title":"State-offset Tuning: State-based Parameter-Efficient Fine-Tuning for\n State Space Models","summary":" State Space Models (SSMs) have emerged as efficient alternatives to\nTransformers, mitigating their quadratic computational cost. However, the\napplication of Parameter-Efficient Fine-Tuning (PEFT) methods to SSMs remains\nlargely unexplored. In particular, prompt-based methods like Prompt Tuning and\nPrefix-Tuning, which are widely used in Transformers, do not perform well on\nSSMs. To address this, we propose state-based methods as a superior alternative\nto prompt-based methods. This new family of methods naturally stems from the\narchitectural characteristics of SSMs. State-based methods adjust state-related\nfeatures directly instead of depending on external prompts. Furthermore, we\nintroduce a novel state-based PEFT method: State-offset Tuning. At every\ntimestep, our method directly affects the state at the current step, leading to\nmore effective adaptation. Through extensive experiments across diverse\ndatasets, we demonstrate the effectiveness of our method. 
Code is available at\nhttps://github.com/furiosa-ai/ssm-state-tuning.\n","authors":["Wonjun Kang","Kevin Galim","Yuchen Zeng","Minjae Lee","Hyung Il Koo","Nam Ik Cho"],"pdf_url":"https://arxiv.org/pdf/2503.03499v1.pdf","comment":"Code is available at https://github.com/furiosa-ai/ssm-state-tuning"},{"id":"http://arxiv.org/abs/2503.03489v1","updated":"2025-03-05T13:29:23Z","published":"2025-03-05T13:29:23Z","title":"Federated Learning for Predicting Mild Cognitive Impairment to Dementia\n Conversion","summary":" Dementia is a progressive condition that impairs an individual's cognitive\nhealth and daily functioning, with mild cognitive impairment (MCI) often\nserving as its precursor. The prediction of MCI to dementia conversion has been\nwell studied, but previous studies have almost always focused on traditional\nMachine Learning (ML) based methods that require sharing sensitive clinical\ninformation to train predictive models. This study proposes a privacy-enhancing\nsolution using Federated Learning (FL) to train predictive models for MCI to\ndementia conversion without sharing sensitive data, leveraging socio\ndemographic and cognitive measures. We simulated and compared two network\narchitectures, Peer to Peer (P2P) and client-server, to enable collaborative\nlearning. Our results demonstrated that FL had comparable predictive\nperformance to centralized ML, and each clinical site showed similar\nperformance without sharing local data. 
Moreover, the predictive performance of\nFL models was superior to site specific models trained without collaboration.\nThis work highlights that FL can eliminate the need for data sharing without\ncompromising model efficacy.\n","authors":["Gaurang Sharma","Elaheh Moradi","Juha Pajula","Mika Hilvo","Jussi Tohka"],"pdf_url":"https://arxiv.org/pdf/2503.03489v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.18355v2","updated":"2025-03-05T13:25:09Z","published":"2024-12-24T11:35:40Z","title":"Handling Spatial-Temporal Data Heterogeneity for Federated Continual\n Learning via Tail Anchor","summary":" Federated continual learning (FCL) allows each client to continually update\nits knowledge from task streams, enhancing the applicability of federated\nlearning in real-world scenarios. However, FCL needs to address not only\nspatial data heterogeneity between clients but also temporal data heterogeneity\nbetween tasks. In this paper, empirical experiments demonstrate that such\ninput-level heterogeneity significantly affects the model's internal parameters\nand outputs, leading to severe spatial-temporal catastrophic forgetting of\nlocal and previous knowledge. To this end, we propose Federated Tail Anchor\n(FedTA) to mix trainable Tail Anchor with the frozen output features to adjust\ntheir position in the feature space, thereby overcoming parameter-forgetting\nand output-forgetting. Three novel components are also included: Input\nEnhancement for improving the performance of pre-trained models on downstream\ntasks; Selective Input Knowledge Fusion for fusion of heterogeneous local\nknowledge on the server; and Best Global Prototype Selection for finding the\nbest anchor point for each class in the feature space. 
Extensive experiments\ndemonstrate that FedTA not only outperforms existing FCL methods but also\neffectively preserves the relative positions of features.\n","authors":["Hao Yu","Xin Yang","Le Zhang","Hanlin Gu","Tianrui Li","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2412.18355v2.pdf","comment":"This paper is accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03486v1","updated":"2025-03-05T13:24:58Z","published":"2025-03-05T13:24:58Z","title":"Differentially Private Learners for Heterogeneous Treatment Effects","summary":" Patient data is widely used to estimate heterogeneous treatment effects and\nthus understand the effectiveness and safety of drugs. Yet, patient data\nincludes highly sensitive information that must be kept private. In this work,\nwe aim to estimate the conditional average treatment effect (CATE) from\nobservational data under differential privacy. Specifically, we present\nDP-CATE, a novel framework for CATE estimation that is Neyman-orthogonal and\nfurther ensures differential privacy of the estimates. Our framework is highly\ngeneral: it applies to any two-stage CATE meta-learner with a Neyman-orthogonal\nloss function, and any machine learning model can be used for nuisance\nestimation. We further provide an extension of our DP-CATE, where we employ\nRKHS regression to release the complete CATE function while ensuring\ndifferential privacy. We demonstrate our DP-CATE across various experiments\nusing synthetic and real-world datasets. 
To the best of our knowledge, we are\nthe first to provide a framework for CATE estimation that is Neyman-orthogonal\nand differentially private.\n","authors":["Maresa Schröder","Valentyn Melnychuk","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2503.03486v1.pdf","comment":"Published at ICLR 2025"},{"id":"http://arxiv.org/abs/2503.03485v1","updated":"2025-03-05T13:24:57Z","published":"2025-03-05T13:24:57Z","title":"TEDDY: A Family Of Foundation Models For Understanding Single Cell\n Biology","summary":" Understanding the biological mechanism of disease is critical for medicine,\nand in particular drug discovery. AI-powered analysis of genome-scale\nbiological data hold great potential in this regard. The increasing\navailability of single-cell RNA sequencing data has enabled the development of\nlarge foundation models for disease biology. However, existing foundation\nmodels either do not improve or only modestly improve over task-specific models\nin downstream applications. Here, we explored two avenues for improving the\nstate-of-the-art. First, we scaled the pre-training dataset to 116 million\ncells, which is larger than those used by previous models. Second, we leveraged\nthe availability of large-scale biological annotations as a form of supervision\nduring pre-training. We trained the TEDDY family of models comprising six\ntransformer-based state-of-the-art single-cell foundation models with 70\nmillion, 160 million, and 400 million parameters. We vetted our models on two\ndownstream evaluation tasks -- identifying the underlying disease state of\nheld-out donors not seen during training and distinguishing healthy cells from\ndiseased ones for disease conditions and donors not seen during training.\nScaling experiments showed that performance improved predictably with both data\nvolume and parameter count. 
Our models showed substantial improvement over\nexisting work on the first task and more muted improvements on the second.\n","authors":["Alexis Chevalier","Soumya Ghosh","Urvi Awasthi","James Watkins","Julia Bieniewska","Nichita Mitrea","Olga Kotova","Kirill Shkura","Andrew Noble","Michael Steinbaugh","Julien Delile","Christoph Meier","Leonid Zhukov","Iya Khalil","Srayanta Mukherjee","Judith Mueller"],"pdf_url":"https://arxiv.org/pdf/2503.03485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20475v2","updated":"2025-03-05T13:22:47Z","published":"2025-02-27T19:23:15Z","title":"Promote, Suppress, Iterate: How Language Models Answer One-to-Many\n Factual Queries","summary":" To answer one-to-many factual queries (e.g., listing cities of a country), a\nlanguage model (LM) must simultaneously recall knowledge and avoid repeating\nprevious answers. How are these two subtasks implemented and integrated\ninternally? Across multiple datasets and models, we identify a\npromote-then-suppress mechanism: the model first recalls all answers, and then\nsuppresses previously generated ones. Specifically, LMs use both the subject\nand previous answer tokens to perform knowledge recall, with attention\npropagating subject information and MLPs promoting the answers. Then, attention\nattends to and suppresses previous answer tokens, while MLPs amplify the\nsuppression signal. Our mechanism is corroborated by extensive experimental\nevidence: in addition to using early decoding and causal tracing, we analyze\nhow components use different tokens by introducing both Token Lens, which\ndecodes aggregated attention updates from specified tokens, and a knockout\nmethod that analyzes changes in MLP outputs after removing attention to\nspecified tokens. Overall, we provide new insights into how LMs' internal\ncomponents interact with different input tokens to support complex factual\nrecall. 
Code is available at\nhttps://github.com/Lorenayannnnn/how-lms-answer-one-to-many-factual-queries.\n","authors":["Tianyi Lorena Yan","Robin Jia"],"pdf_url":"https://arxiv.org/pdf/2502.20475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02393v3","updated":"2025-03-05T13:19:16Z","published":"2025-01-04T22:30:21Z","title":"Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers","summary":" We present an approach to modifying Transformer architectures by integrating\ngraph-aware relational reasoning into the attention mechanism, merging concepts\nfrom graph neural networks and language modeling. Building on the inherent\nconnection between attention and graph theory, we reformulate the Transformer's\nattention mechanism as a graph operation and propose Graph-Aware Isomorphic\nAttention. This method leverages advanced graph modeling strategies, including\nGraph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA),\nto enrich the representation of relational structures. Our approach captures\ncomplex dependencies and generalizes across tasks, as evidenced by a reduced\ngeneralization gap and improved learning performance. Additionally, we expand\nthe concept of graph-aware attention to introduce Sparse GIN-Attention, a\nfine-tuning approach that employs sparse GINs. By interpreting attention\nmatrices as sparse adjacency graphs, this technique enhances the adaptability\nof pre-trained foundational models with minimal computational overhead,\nendowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning\nachieves improved training dynamics and better generalization compared to\nalternative methods like low-rank adaption (LoRA). We discuss latent graph-like\nstructures within traditional attention mechanisms, offering a new lens through\nwhich Transformers can be understood. By evolving Transformers as hierarchical\nGIN models for relational reasoning. 
This perspective suggests profound\nimplications for foundational model development, enabling the design of\narchitectures that dynamically adapt to both local and global dependencies.\nApplications in bioinformatics, materials science, language modeling, and\nbeyond could benefit from this synthesis of relational and sequential data\nmodeling, setting the stage for interpretable and generalizable modeling\nstrategies.\n","authors":["Markus J. Buehler"],"pdf_url":"https://arxiv.org/pdf/2501.02393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14848v2","updated":"2025-03-05T12:52:30Z","published":"2023-10-23T12:15:23Z","title":"Zero-Knowledge Proof-based Verifiable Decentralized Machine Learning in\n Communication Network: A Comprehensive Survey","summary":" Over recent decades, machine learning has significantly advanced network\ncommunication, enabling improved decision-making, user behavior analysis, and\nfault detection. Decentralized approaches, where participants exchange\ncomputation results instead of raw private data, mitigate these risks but\nintroduce challenges related to trust and verifiability. A critical issue\narises: How can one ensure the integrity and validity of computation results\nshared by other participants? Existing survey articles predominantly address\nsecurity and privacy concerns in decentralized machine learning, whereas this\nsurvey uniquely highlights the emerging issue of verifiability. Recognizing the\ncritical role of zero-knowledge proofs in ensuring verifiability, we present a\ncomprehensive review of Zero-Knowledge Proof-based Verifiable Machine Learning\n(ZKP-VML). To clarify the research problem, we present a definition of ZKP-VML\nconsisting of four algorithms, along with several corresponding key security\nproperties. Besides, we provide an overview of the current research landscape\nby systematically organizing the research timeline and categorizing existing\nschemes based on their security properties. 
Furthermore, through an in-depth\nanalysis of each existing scheme, we summarize their technical contributions\nand optimization strategies, aiming to uncover common design principles\nunderlying ZKP-VML schemes. Building on the reviews and analysis presented, we\nidentify current research challenges and suggest future research directions. To\nthe best of our knowledge, this is the most comprehensive survey to date on\nverifiable decentralized machine learning and ZKP-VML.\n","authors":["Zhibo Xing","Zijian Zhang","Ziang Zhang","Zhen Li","Meng Li","Jiamou Liu","Zongyang Zhang","Yi Zhao","Qi Sun","Liehuang Zhu","Giovanni Russello"],"pdf_url":"https://arxiv.org/pdf/2310.14848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03462v1","updated":"2025-03-05T12:52:14Z","published":"2025-03-05T12:52:14Z","title":"Open-Source Large Language Models as Multilingual Crowdworkers:\n Synthesizing Open-Domain Dialogues in Several Languages With No Examples in\n Targets and No Machine Translation","summary":" The prevailing paradigm in the domain of Open-Domain Dialogue agents\npredominantly focuses on the English language, encompassing both models and\ndatasets. Furthermore, the financial and temporal investments required for\ncrowdsourcing such datasets for finetuning are substantial, particularly when\nmultiple languages are involved. Fortunately, advancements in Large Language\nModels (LLMs) have unveiled a plethora of possibilities across diverse tasks.\nSpecifically, instruction-tuning has enabled LLMs to execute tasks based on\nnatural language instructions, occasionally surpassing the performance of human\ncrowdworkers. Additionally, these models possess the capability to function in\nvarious languages within a single thread. Consequently, to generate new samples\nin different languages, we propose leveraging these capabilities to replicate\nthe data collection process. 
We introduce a pipeline for generating Open-Domain\nDialogue data in multiple Target Languages using LLMs, with demonstrations\nprovided in a unique Source Language. By eschewing explicit Machine Translation\nin this approach, we enhance the adherence to language-specific nuances. We\napply this methodology to the PersonaChat dataset. To enhance the openness of\ngenerated dialogues and mimic real life scenarii, we added the notion of speech\nevents corresponding to the type of conversation the speakers are involved in\nand also that of common ground which represents the premises of a conversation.\n","authors":["Ahmed Njifenjou","Virgile Sucal","Bassam Jabaian","Fabrice Lefèvre"],"pdf_url":"https://arxiv.org/pdf/2503.03462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03454v1","updated":"2025-03-05T12:40:34Z","published":"2025-03-05T12:40:34Z","title":"Data Poisoning Attacks to Locally Differentially Private Range Query\n Protocols","summary":" Trajectory data, which tracks movements through geographic locations, is\ncrucial for improving real-world applications. However, collecting such\nsensitive data raises considerable privacy concerns. Local differential privacy\n(LDP) offers a solution by allowing individuals to locally perturb their\ntrajectory data before sharing it. Despite its privacy benefits, LDP protocols\nare vulnerable to data poisoning attacks, where attackers inject fake data to\nmanipulate aggregated results. In this work, we make the first attempt to\nanalyze vulnerabilities in several representative LDP trajectory protocols. We\npropose \\textsc{TraP}, a heuristic algorithm for data \\underline{P}oisoning\nattacks using a prefix-suffix method to optimize fake \\underline{Tra}jectory\nselection, significantly reducing computational complexity. Our experimental\nresults demonstrate that our attack can substantially increase target pattern\noccurrences in the perturbed trajectory dataset with few fake users. 
This study\nunderscores the urgent need for robust defenses and better protocol designs to\nsafeguard LDP trajectory data against malicious manipulation.\n","authors":["I-Jung Hsu","Chih-Hsun Lin","Chia-Mu Yu","Sy-Yen Kuo","Chun-Ying Huang"],"pdf_url":"https://arxiv.org/pdf/2503.03454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03443v1","updated":"2025-03-05T12:24:12Z","published":"2025-03-05T12:24:12Z","title":"Conceptualizing Uncertainty","summary":" Uncertainty in machine learning refers to the degree of confidence or lack\nthereof in a model's predictions. While uncertainty quantification methods\nexist, explanations of uncertainty, especially in high-dimensional settings,\nremain an open challenge. Existing work focuses on feature attribution\napproaches which are restricted to local explanations. Understanding\nuncertainty, its origins, and characteristics on a global scale is crucial for\nenhancing interpretability and trust in a model's predictions. In this work, we\npropose to explain the uncertainty in high-dimensional data classification\nsettings by means of concept activation vectors which give rise to local and\nglobal explanations of uncertainty. We demonstrate the utility of the generated\nexplanations by leveraging them to refine and improve our model.\n","authors":["Isaac Roberts","Alexander Schulz","Sarah Schroeder","Fabian Hinder","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2503.03443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03438v1","updated":"2025-03-05T12:13:08Z","published":"2025-03-05T12:13:08Z","title":"Gradient Deconfliction via Orthogonal Projections onto Subspaces For\n Multi-task Learning","summary":" Although multi-task learning (MTL) has been a preferred approach and\nsuccessfully applied in many real-world scenarios, MTL models are not\nguaranteed to outperform single-task models on all tasks mainly due to the\nnegative effects of conflicting gradients among the tasks. 
In this paper, we\nfully examine the influence of conflicting gradients and further emphasize the\nimportance and advantages of achieving non-conflicting gradients which allows\nsimple but effective trade-off strategies among the tasks with stable\nperformance. Based on our findings, we propose the Gradient Deconfliction via\nOrthogonal Projections onto Subspaces (GradOPS) spanned by other task-specific\ngradients. Our method not only solves all conflicts among the tasks, but can\nalso effectively search for diverse solutions towards different trade-off\npreferences among the tasks. Theoretical analysis on convergence is provided,\nand performance of our algorithm is fully testified on multiple benchmarks in\nvarious domains. Results demonstrate that our method can effectively find\nmultiple state-of-the-art solutions with different trade-off strategies among\nthe tasks on multiple datasets.\n","authors":["Shijie Zhu","Hui Zhao","Tianshu Wu","Pengjie Wang","Hongbo Deng","Jian Xu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2503.03438v1.pdf","comment":"WSDM 2025"},{"id":"http://arxiv.org/abs/2503.00578v2","updated":"2025-03-05T12:00:38Z","published":"2025-03-01T18:00:41Z","title":"Channel-Attentive Graph Neural Networks","summary":" Graph Neural Networks (GNNs) set the state-of-the-art in representation\nlearning for graph-structured data. They are used in many domains, from online\nsocial networks to complex molecules. Most GNNs leverage the message-passing\nparadigm and achieve strong performances on various tasks. However, the\nmessage-passing mechanism used in most models suffers from over-smoothing as a\nGNN's depth increases. The over-smoothing degrades GNN's performance due to the\nincreased similarity between the representations of unrelated nodes. This study\nproposes an adaptive channel-wise message-passing approach to alleviate the\nover-smoothing. 
The proposed model, Channel-Attentive GNN, learns how to attend\nto neighboring nodes and their feature channels. Thus, much diverse information\ncan be transferred between nodes during message-passing. Experiments with\nwidely used benchmark datasets show that the proposed model is more resistant\nto over-smoothing than baselines and achieves state-of-the-art performances for\nvarious graphs with strong heterophily. Our code is at\nhttps://github.com/ALLab-Boun/CHAT-GNN.\n","authors":["Tuğrul Hasan Karabulut","İnci M. Baytaş"],"pdf_url":"https://arxiv.org/pdf/2503.00578v2.pdf","comment":"Published as a conference paper at IEEE International Conference on\n Data Mining 2024"},{"id":"http://arxiv.org/abs/2503.03426v1","updated":"2025-03-05T11:59:31Z","published":"2025-03-05T11:59:31Z","title":"Early-Stopped Mirror Descent for Linear Regression over Convex Bodies","summary":" Early-stopped iterative optimization methods are widely used as alternatives\nto explicit regularization, and direct comparisons between early-stopping and\nexplicit regularization have been established for many optimization geometries.\nHowever, most analyses depend heavily on the specific properties of the\noptimization geometry or strong convexity of the empirical objective, and it\nremains unclear whether early-stopping could ever be less statistically\nefficient than explicit regularization for some particular shape constraint,\nespecially in the overparameterized regime. To address this question, we study\nthe setting of high-dimensional linear regression under additive Gaussian noise\nwhen the ground truth is assumed to lie in a known convex body and the task is\nto minimize the in-sample mean squared error. Our main result shows that for\nany convex body and any design matrix, up to an absolute constant factor, the\nworst-case risk of unconstrained early-stopped mirror descent with an\nappropriate potential is at most that of the least squares estimator\nconstrained to the convex body. 
We achieve this by constructing algorithmic\nregularizers based on the Minkowski functional of the convex body.\n","authors":["Tobias Wegel","Gil Kur","Patrick Rebeschini"],"pdf_url":"https://arxiv.org/pdf/2503.03426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.00735v3","updated":"2025-03-05T11:50:24Z","published":"2025-03-02T05:16:43Z","title":"LADDER: Self-Improving LLMs Through Recursive Problem Decomposition","summary":" We introduce LADDER (Learning through Autonomous Difficulty-Driven Example\nRecursion), a framework which enables Large Language Models to autonomously\nimprove their problem-solving capabilities through self-guided learning by\nrecursively generating and solving progressively simpler variants of complex\nproblems. Unlike prior approaches that require curated datasets or human\nfeedback, LADDER leverages a model's own capabilities to generate easier\nquestion variants. We demonstrate LADDER's effectiveness in the subject of\nmathematical integration, improving Llama 3.2 3B's accuracy from 1% to 82% on\nundergraduate-level problems and enabling Qwen2.5 7B Deepseek-R1 Distilled to\nachieve 73% on the MIT Integration Bee qualifying examination. We also\nintroduce TTRL (Test-Time Reinforcement Learning), where we perform\nreinforcement learning on variants of test problems at inference time. TTRL\nenables Qwen2.5 7B Deepseek-R1 Distilled to achieve a state-of-the-art score of\n90% on the MIT Integration Bee qualifying examination, surpassing OpenAI o1's\nperformance. 
These results show how self-directed strategic learning can\nachieve significant capability improvements without relying on architectural\nscaling or human supervision.\n","authors":["Toby Simonds","Akira Yoshiyama"],"pdf_url":"https://arxiv.org/pdf/2503.00735v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18377v3","updated":"2025-03-05T11:49:36Z","published":"2024-12-24T12:03:36Z","title":"ChaI-TeA: A Benchmark for Evaluating Autocompletion of Interactions with\n LLM-based Chatbots","summary":" The rise of LLMs has deflected a growing portion of human-computer\ninteractions towards LLM-based chatbots. The remarkable abilities of these\nmodels allow users to interact using long, diverse natural language text\ncovering a wide range of topics and styles. Phrasing these messages is a time\nand effort consuming task, calling for an autocomplete solution to assist\nusers. We introduce the task of chatbot interaction autocomplete. We present\nChaI-TeA: CHat InTEraction Autocomplete; An autcomplete evaluation framework\nfor LLM-based chatbot interactions. The framework includes a formal definition\nof the task, coupled with suitable datasets and metrics. We use the framework\nto evaluate After formally defining the task along with suitable datasets and\nmetrics, we test 9 models on the defined auto completion task, finding that\nwhile current off-the-shelf models perform fairly, there is still much room for\nimprovement, mainly in ranking of the generated suggestions. We provide\ninsights for practitioners working on this task and open new research\ndirections for researchers in the field. 
We release our framework to serve as a\nfoundation for future research.\n","authors":["Shani Goren","Oren Kalinsky","Tomer Stav","Yuri Rapoport","Yaron Fairstein","Ram Yazdi","Nachshon Cohen","Alexander Libov","Guy Kushilevitz"],"pdf_url":"https://arxiv.org/pdf/2412.18377v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03418v1","updated":"2025-03-05T11:47:41Z","published":"2025-03-05T11:47:41Z","title":"Simplicial SMOTE: Oversampling Solution to the Imbalanced Learning\n Problem","summary":" SMOTE (Synthetic Minority Oversampling Technique) is the established\ngeometric approach to random oversampling to balance classes in the imbalanced\nlearning problem, followed by many extensions. Its idea is to introduce\nsynthetic data points of the minor class, with each new point being the convex\ncombination of an existing data point and one of its k-nearest neighbors. In\nthis paper, by viewing SMOTE as sampling from the edges of a geometric\nneighborhood graph and borrowing tools from the topological data analysis, we\npropose a novel technique, Simplicial SMOTE, that samples from the simplices of\na geometric neighborhood simplicial complex. A new synthetic point is defined\nby the barycentric coordinates w.r.t. a simplex spanned by an arbitrary number\nof data points being sufficiently close rather than a pair. Such a replacement\nof the geometric data model results in better coverage of the underlying data\ndistribution compared to existing geometric sampling methods and allows the\ngeneration of synthetic points of the minority class closer to the majority\nclass on the decision boundary. We experimentally demonstrate that our\nSimplicial SMOTE outperforms several popular geometric sampling methods,\nincluding the original SMOTE. Moreover, we show that simplicial sampling can be\neasily integrated into existing SMOTE extensions. 
We generalize and evaluate\nsimplicial extensions of the classic Borderline SMOTE, Safe-level SMOTE, and\nADASYN algorithms, all of which outperform their graph-based counterparts.\n","authors":["Oleg Kachan","Andrey Savchenko","Gleb Gusev"],"pdf_url":"https://arxiv.org/pdf/2503.03418v1.pdf","comment":"Accepted at KDD 2025 (research track)"},{"id":"http://arxiv.org/abs/2412.14566v2","updated":"2025-03-05T11:38:00Z","published":"2024-12-19T06:35:54Z","title":"AIArena: A Blockchain-Based Decentralized AI Training Platform","summary":" The rapid advancement of AI has underscored critical challenges in its\ndevelopment and implementation, largely due to centralized control by a few\nmajor corporations. This concentration of power intensifies biases within AI\nmodels, resulting from inadequate governance and oversight mechanisms.\nAdditionally, it limits public involvement and heightens concerns about the\nintegrity of model generation. Such monopolistic control over data and AI\noutputs threatens both innovation and fair data usage, as users inadvertently\ncontribute data that primarily benefits these corporations. In this work, we\npropose AIArena, a blockchain-based decentralized AI training platform designed\nto democratize AI development and alignment through on-chain incentive\nmechanisms. AIArena fosters an open and collaborative environment where\nparticipants can contribute models and computing resources. Its on-chain\nconsensus mechanism ensures fair rewards for participants based on their\ncontributions. We instantiate and implement AIArena on the public Base\nblockchain Sepolia testnet, and the evaluation results demonstrate the\nfeasibility of AIArena in real-world applications.\n","authors":["Zhipeng Wang","Rui Sun","Elizabeth Lui","Tuo Zhou","Yizhe Wen","Jiahao Sun"],"pdf_url":"https://arxiv.org/pdf/2412.14566v2.pdf","comment":"Camera ready version. 
Accepted by the ACM Web Conference (WWW), 2025"},{"id":"http://arxiv.org/abs/2408.09838v2","updated":"2025-03-05T11:27:17Z","published":"2024-08-19T09:33:31Z","title":"Mitigating the Stability-Plasticity Dilemma in Adaptive Train Scheduling\n with Curriculum-Driven Continual DQN Expansion","summary":" A continual learning agent builds on previous experiences to develop\nincreasingly complex behaviors by adapting to non-stationary and dynamic\nenvironments while preserving previously acquired knowledge. However, scaling\nthese systems presents significant challenges, particularly in balancing the\npreservation of previous policies with the adaptation of new ones to current\nenvironments. This balance, known as the stability-plasticity dilemma, is\nespecially pronounced in complex multi-agent domains such as the train\nscheduling problem, where environmental and agent behaviors are constantly\nchanging, and the search space is vast. In this work, we propose addressing\nthese challenges in the train scheduling problem using curriculum learning. We\ndesign a curriculum with adjacent skills that build on each other to improve\ngeneralization performance. Introducing a curriculum with distinct tasks\nintroduces non-stationarity, which we address by proposing a new algorithm:\nContinual Deep Q-Network (DQN) Expansion (CDE). Our approach dynamically\ngenerates and adjusts Q-function subspaces to handle environmental changes and\ntask requirements. 
CDE mitigates catastrophic forgetting through EWC while\nensuring high plasticity using adaptive rational activation functions.\nExperimental results demonstrate significant improvements in learning\nefficiency and adaptability compared to RL baselines and other adapted methods\nfor continual learning, highlighting the potential of our method in managing\nthe stability-plasticity dilemma in the adaptive train scheduling setting.\n","authors":["Achref Jaziri","Etienne Künzel","Visvanathan Ramesh"],"pdf_url":"https://arxiv.org/pdf/2408.09838v2.pdf","comment":"9 Pages, 2 Figures"},{"id":"http://arxiv.org/abs/2503.03401v1","updated":"2025-03-05T11:24:55Z","published":"2025-03-05T11:24:55Z","title":"Evolutionary Prediction Games","summary":" When users decide whether to use a system based on the quality of predictions\nthey receive, learning has the capacity to shape the population of users it\nserves - for better or worse. This work aims to study the long-term\nimplications of this process through the lens of evolutionary game theory. We\nintroduce and study evolutionary prediction games, designed to capture the role\nof learning as a driver of natural selection between groups of users, and hence\na determinant of evolutionary outcomes. Our main theoretical results show that:\n(i) in settings with unlimited data and compute, learning tends to reinforce\nthe survival of the fittest, and (ii) in more realistic settings, opportunities\nfor coexistence emerge. We analyze these opportunities in terms of their\nstability and feasibility, present several mechanisms that can sustain their\nexistence, and empirically demonstrate our findings using real and synthetic\ndata.\n","authors":["Eden Saig","Nir Rosenfeld"],"pdf_url":"https://arxiv.org/pdf/2503.03401v1.pdf","comment":"Comments are welcome"},{"id":"http://arxiv.org/abs/2503.03399v1","updated":"2025-03-05T11:21:37Z","published":"2025-03-05T11:21:37Z","title":"Predicting Practically? 
Domain Generalization for Predictive Analytics\n in Real-world Environments","summary":" Predictive machine learning models are widely used in customer relationship\nmanagement (CRM) to forecast customer behaviors and support decision-making.\nHowever, the dynamic nature of customer behaviors often results in significant\ndistribution shifts between training data and serving data, leading to\nperformance degradation in predictive models. Domain generalization, which aims\nto train models that can generalize to unseen environments without prior\nknowledge of their distributions, has become a critical area of research. In\nthis work, we propose a novel domain generalization method tailored to handle\ncomplex distribution shifts, encompassing both covariate and concept shifts.\nOur method builds upon the Distributionally Robust Optimization framework,\noptimizing model performance over a set of hypothetical worst-case\ndistributions rather than relying solely on the training data. Through\nsimulation experiments, we demonstrate the working mechanism of the proposed\nmethod. We also conduct experiments on a real-world customer churn dataset, and\nvalidate its effectiveness in both temporal and spatial generalization\nsettings. Finally, we discuss the broader implications of our method for\nadvancing Information Systems (IS) design research, particularly in building\nrobust predictive models for dynamic managerial environments.\n","authors":["Hanyu Duan","Yi Yang","Ahmed Abbasi","Kar Yan Tam"],"pdf_url":"https://arxiv.org/pdf/2503.03399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12972v2","updated":"2025-03-05T11:18:41Z","published":"2024-11-20T01:54:52Z","title":"UniFlow: A Foundation Model for Unified Urban Spatio-Temporal Flow\n Prediction","summary":" Urban spatio-temporal flow prediction, encompassing traffic flows and crowd\nflows, is crucial for optimizing city infrastructure and managing traffic and\nemergency responses. 
Traditional approaches have relied on separate models\ntailored to either grid-based data, representing cities as uniform cells, or\ngraph-based data, modeling cities as networks of nodes and edges. In this\npaper, we build UniFlow, a foundational model for general urban flow prediction\nthat unifies both grid-based and graphbased data. We first design a multi-view\nspatio-temporal patching mechanism to standardize different data into a\nconsistent sequential format and then introduce a spatio-temporal transformer\narchitecture to capture complex correlations and dynamics. To leverage shared\nspatio-temporal patterns across different data types and facilitate effective\ncross-learning, we propose SpatioTemporal Memory Retrieval Augmentation\n(ST-MRA). By creating structured memory modules to store shared spatio-temporal\npatterns, ST-MRA enhances predictions through adaptive memory retrieval.\nExtensive experiments demonstrate that UniFlow outperforms existing models in\nboth grid-based and graph-based flow prediction, excelling particularly in\nscenarios with limited data availability, showcasing its superior performance\nand broad applicability. The datasets and code implementation have been\nreleased on https://github.com/YuanYuan98/UniFlow.\n","authors":["Yuan Yuan","Jingtao Ding","Chonghua Han","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.12972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03391v1","updated":"2025-03-05T11:12:40Z","published":"2025-03-05T11:12:40Z","title":"Multi-Agent DRL for Queue-Aware Task Offloading in Hierarchical\n MEC-Enabled Air-Ground Networks","summary":" Mobile edge computing (MEC)-enabled air-ground networks are a key component\nof 6G, employing aerial base stations (ABSs) such as unmanned aerial vehicles\n(UAVs) and high-altitude platform stations (HAPS) to provide dynamic services\nto ground IoT devices (IoTDs). 
These IoTDs support real-time applications\n(e.g., multimedia and Metaverse services) that demand high computational\nresources and strict quality of service (QoS) guarantees in terms of latency\nand task queue management. Given their limited energy and processing\ncapabilities, IoTDs rely on UAVs and HAPS to offload tasks for distributed\nprocessing, forming a multi-tier MEC system. This paper tackles the overall\nenergy minimization problem in MEC-enabled air-ground integrated networks\n(MAGIN) by jointly optimizing UAV trajectories, computing resource allocation,\nand queue-aware task offloading decisions. The optimization is challenging due\nto the nonconvex, nonlinear nature of this hierarchical system, which renders\ntraditional methods ineffective. We reformulate the problem as a multi-agent\nMarkov decision process (MDP) with continuous action spaces and heterogeneous\nagents, and propose a novel variant of multi-agent proximal policy optimization\nwith a Beta distribution (MAPPO-BD) to solve it. Extensive simulations show\nthat MAPPO-BD outperforms baseline schemes, achieving superior energy savings\nand efficient resource management in MAGIN while meeting queue delay and edge\ncomputing constraints.\n","authors":["Muhammet Hevesli","Abegaz Mohammed Seid","Aiman Erbad","Mohamed Abdallah"],"pdf_url":"https://arxiv.org/pdf/2503.03391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.19166v2","updated":"2025-03-05T11:09:06Z","published":"2025-02-26T14:19:49Z","title":"CodeIF: Benchmarking the Instruction-Following Capabilities of Large\n Language Models for Code Generation","summary":" With the rapid advancement of Large Language Models (LLMs), the demand for\nrobust instruction-following capabilities in code generation tasks has grown\nsignificantly. Code generation not only facilitates faster prototyping and\nautomated testing, but also augments developer efficiency through improved\nmaintainability and reusability of code. 
In this paper, we introduce CodeIF,\nthe first benchmark specifically designed to assess the abilities of LLMs to\nadhere to task-oriented instructions within diverse code generation scenarios.\nCodeIF encompasses a broad range of tasks, including function synthesis, error\ndebugging, algorithmic refactoring, and code explanation, thereby providing a\ncomprehensive suite to evaluate model performance across varying complexity\nlevels and programming domains. We conduct extensive experiments with LLMs,\nanalyzing their strengths and limitations in meeting the demands of these\ntasks. The experimental results offer valuable insights into how well current\nmodels align with human instructions, as well as the extent to which they can\ngenerate consistent, maintainable, and contextually relevant code. Our findings\nnot only underscore the critical role that instruction-following LLMs can play\nin modern software development, but also illuminate pathways for future\nresearch aimed at enhancing their adaptability, reliability, and overall\neffectiveness in automated code generation.\n","authors":["Kaiwen Yan","Hongcheng Guo","Xuanqing Shi","Jingyi Xu","Yaonan Gu","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2502.19166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.09453v2","updated":"2025-03-05T11:04:58Z","published":"2022-06-19T17:13:58Z","title":"Bounding Evidence and Estimating Log-Likelihood in VAE","summary":" Many crucial problems in deep learning and statistical inference are caused\nby a variational gap, i.e., a difference between model evidence\n(log-likelihood) and evidence lower bound (ELBO). In particular, in a classical\nVAE setting that involves training via an ELBO cost function, it is difficult\nto provide a robust comparison of the effects of training between models, since\nwe do not know a log-likelihood of data (but only its lower bound). 
In this\npaper, to deal with this problem, we introduce a general and effective upper\nbound, which allows us to efficiently approximate the evidence of data. We\nprovide extensive theoretical and experimental studies of our approach,\nincluding its comparison to the other state-of-the-art upper bounds, as well as\nits application as a tool for the evaluation of models that were trained on\nvarious lower bounds.\n","authors":["Łukasz Struski","Marcin Mazur","Paweł Batorski","Przemysław Spurek","Jacek Tabor"],"pdf_url":"https://arxiv.org/pdf/2206.09453v2.pdf","comment":"Paper accepted for AISTATS 2023"},{"id":"http://arxiv.org/abs/2503.03384v1","updated":"2025-03-05T11:02:29Z","published":"2025-03-05T11:02:29Z","title":"GNNMerge: Merging of GNN Models Without Accessing Training Data","summary":" Model merging has gained prominence in machine learning as a method to\nintegrate multiple trained models into a single model without accessing the\noriginal training data. While existing approaches have demonstrated success in\ndomains such as computer vision and NLP, their application to Graph Neural\nNetworks (GNNs) remains unexplored. These methods often rely on the assumption\nof shared initialization, which is seldom applicable to GNNs. In this work, we\nundertake the first benchmarking study of model merging algorithms for GNNs,\nrevealing their limited effectiveness in this context. To address these\nchallenges, we propose GNNMerge, which utilizes a task-agnostic node embedding\nalignment strategy to merge GNNs. Furthermore, we establish that under a mild\nrelaxation, the proposed optimization objective admits direct analytical\nsolutions for widely used GNN architectures, significantly enhancing its\ncomputational efficiency. 
Empirical evaluations across diverse datasets, tasks,\nand architectures establish GNNMerge to be up to 24% more accurate than\nexisting methods while delivering over 2 orders of magnitude speed-up compared\nto training from scratch.\n","authors":["Vipul Garg","Ishita Thakre","Sayan Ranu"],"pdf_url":"https://arxiv.org/pdf/2503.03384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03382v1","updated":"2025-03-05T10:57:34Z","published":"2025-03-05T10:57:34Z","title":"Paths and Ambient Spaces in Neural Loss Landscapes","summary":" Understanding the structure of neural network loss surfaces, particularly the\nemergence of low-loss tunnels, is critical for advancing neural network theory\nand practice. In this paper, we propose a novel approach to directly embed loss\ntunnels into the loss landscape of neural networks. Exploring the properties of\nthese loss tunnels offers new insights into their length and structure and\nsheds light on some common misconceptions. We then apply our approach to\nBayesian neural networks, where we improve subspace inference by identifying\npitfalls and proposing a more natural prior that better guides the sampling\nprocedure.\n","authors":["Daniel Dold","Julius Kobialka","Nicolai Palm","Emanuel Sommer","David Rügamer","Oliver Dürr"],"pdf_url":"https://arxiv.org/pdf/2503.03382v1.pdf","comment":"9 pages, Accepted at AISTATS 2025"},{"id":"http://arxiv.org/abs/2411.15692v2","updated":"2025-03-05T10:54:30Z","published":"2024-11-24T03:06:59Z","title":"DrugAgent: Automating AI-aided Drug Discovery Programming through LLM\n Multi-Agent Collaboration","summary":" Recent progress in Large Language Models (LLMs) has drawn attention to their\npotential for accelerating drug discovery. However, a central problem remains:\ntranslating theoretical ideas into robust implementations in the highly\nspecialized context of pharmaceutical research. 
This limitation prevents\npractitioners from making full use of the latest AI developments in drug\ndiscovery. To address this challenge, we introduce DrugAgent, a multi-agent\nframework that automates machine learning (ML) programming for drug discovery\ntasks. DrugAgent employs an LLM Planner that formulates high-level ideas and an\nLLM Instructor that identifies and integrates domain knowledge when\nimplementing those ideas. We present case studies on three representative drug\ndiscovery tasks. Our results show that DrugAgent consistently outperforms\nleading baselines, including a relative improvement of 4.92% in ROC-AUC\ncompared to ReAct for drug-target interaction (DTI). DrugAgent is publicly\navailable at https://anonymous.4open.science/r/drugagent-5C42/.\n","authors":["Sizhe Liu","Yizhou Lu","Siyu Chen","Xiyang Hu","Jieyu Zhao","Yingzhou Lu","Yue Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.15692v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.15425v4","updated":"2025-03-05T10:48:42Z","published":"2025-02-21T12:52:16Z","title":"TAG: A Decentralized Framework for Multi-Agent Hierarchical\n Reinforcement Learning","summary":" Hierarchical organization is fundamental to biological systems and human\nsocieties, yet artificial intelligence systems often rely on monolithic\narchitectures that limit adaptability and scalability. Current hierarchical\nreinforcement learning (HRL) approaches typically restrict hierarchies to two\nlevels or require centralized training, which limits their practical\napplicability. We introduce TAME Agent Framework (TAG), a framework for\nconstructing fully decentralized hierarchical multi-agent systems. TAG enables\nhierarchies of arbitrary depth through a novel LevelEnv concept, which\nabstracts each hierarchy level as the environment for the agents above it. This\napproach standardizes information flow between levels while preserving loose\ncoupling, allowing for seamless integration of diverse agent types. 
We\ndemonstrate the effectiveness of TAG by implementing hierarchical architectures\nthat combine different RL agents across multiple levels, achieving improved\nperformance over classical multi-agent RL baselines on standard benchmarks. Our\nresults show that decentralized hierarchical organization enhances both\nlearning speed and final performance, positioning TAG as a promising direction\nfor scalable multi-agent systems.\n","authors":["Giuseppe Paolo","Abdelhakim Benechehab","Hamza Cherkaoui","Albert Thomas","Balázs Kégl"],"pdf_url":"https://arxiv.org/pdf/2502.15425v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03565v2","updated":"2025-03-05T10:47:17Z","published":"2024-10-04T16:15:31Z","title":"Exploration Implies Data Augmentation: Reachability and Generalisation\n in Contextual MDPs","summary":" In the zero-shot policy transfer (ZSPT) setting for contextual Markov\ndecision processes (MDP), agents train on a fixed set of contexts and must\ngeneralise to new ones. Recent work has argued and demonstrated that increased\nexploration can improve this generalisation, by training on more states in the\ntraining contexts. In this paper, we demonstrate that training on more states\ncan indeed improve generalisation, but can come at a cost of reducing the\naccuracy of the learned value function which should not benefit generalisation.\nWe introduce reachability in the ZSPT setting to define which states/contexts\nrequire generalisation and explain why exploration can improve it. We\nhypothesise and demonstrate that using exploration to increase the agent's\ncoverage while also increasing the accuracy improves generalisation even more.\nInspired by this, we propose a method Explore-Go that implements an exploration\nphase at the beginning of each episode, which can be combined with existing on-\nand off-policy RL algorithms and significantly improves generalisation even in\npartially observable MDPs. 
We demonstrate the effectiveness of Explore-Go when\ncombined with several popular algorithms and show an increase in generalisation\nperformance across several environments. With this, we hope to provide\npractitioners with a simple modification that can improve the generalisation of\ntheir agents.\n","authors":["Max Weltevrede","Caroline Horsch","Matthijs T. J. Spaan","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2410.03565v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.08069"},{"id":"http://arxiv.org/abs/2503.03372v1","updated":"2025-03-05T10:47:06Z","published":"2025-03-05T10:47:06Z","title":"A Novel Multi-Criteria Local Latin Hypercube Refinement System for\n Commutation Angle Improvement in IPMSMs","summary":" The commutation angle is defined as the angle between the fundamental of the\nmotor phase current and the fundamental of the back-EMF. It can be utilised to\nprovide a compensating effect in IPMSMs. This is due to the reluctance torque\ncomponent being dependent on the commutation angle of the phase current even\nbefore entering the extended speed range. A real-time maximum torque per\ncurrent and voltage strategy is demonstrated to find the trajectory and optimum\ncommutation angles, gamma, where the level of accuracy depends on the\napplication and available computational speed. A magnet volume reduction using\na novel multi-criteria local Latin hypercube refinement (MLHR) sampling system\nis also presented to improve the optimisation process. The proposed new\ntechnique minimises the magnet mass to motor torque density whilst maintaining\na similar phase current level. A mapping of gamma allows the determination of\nthe optimum angles, as shown in this paper. The 3rd generation Toyota Prius\nIPMSM is considered as the reference motor, where the rotor configuration is\naltered to allow for an individual assessment.\n","authors":["Pedram Asef","Mouloud Denai","Johannes J. H. 
Paulides","Bruno Ricardo Marques","Andrew Lapthorn"],"pdf_url":"https://arxiv.org/pdf/2503.03372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02772v2","updated":"2025-03-05T10:42:53Z","published":"2024-09-04T14:51:36Z","title":"Unifying Causal Representation Learning with the Invariance Principle","summary":" Causal representation learning (CRL) aims at recovering latent causal\nvariables from high-dimensional observations to solve causal downstream tasks,\nsuch as predicting the effect of new interventions or more robust\nclassification. A plethora of methods have been developed, each tackling\ncarefully crafted problem settings that lead to different types of\nidentifiability. These different settings are widely assumed to be important\nbecause they are often linked to different rungs of Pearl's causal hierarchy,\neven though this correspondence is not always exact. This work shows that\ninstead of strictly conforming to this hierarchical mapping, many causal\nrepresentation learning approaches methodologically align their representations\nwith inherent data symmetries. Identification of causal variables is guided by\ninvariance principles that are not necessarily causal. This result allows us to\nunify many existing approaches in a single method that can mix and match\ndifferent assumptions, including non-causal ones, based on the invariance\nrelevant to the problem at hand. It also significantly benefits applicability,\nwhich we demonstrate by improving treatment effect estimation on real-world\nhigh-dimensional ecological data. 
Overall, this paper clarifies the role of\ncausal assumptions in the discovery of causal variables and shifts the focus to\npreserving data symmetries.\n","authors":["Dingling Yao","Dario Rancati","Riccardo Cadei","Marco Fumero","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2409.02772v2.pdf","comment":"ICLR2025 Camera ready"},{"id":"http://arxiv.org/abs/2503.03360v1","updated":"2025-03-05T10:40:09Z","published":"2025-03-05T10:40:09Z","title":"Transformers for molecular property prediction: Domain adaptation\n efficiently improves performance","summary":" Most of the current transformer-based chemical language models are\npre-trained on millions to billions of molecules. However, the improvement from\nsuch scaling in dataset size is not confidently linked to improved molecular\nproperty prediction. The aim of this study is to investigate and overcome some\nof the limitations of transformer models in predicting molecular properties.\nSpecifically, we examine the impact of pre-training dataset size and diversity\non the performance of transformer models and investigate the use of domain\nadaptation as a technique for improving model performance. First, our findings\nindicate that increasing pretraining dataset size beyond 400K molecules from\nthe GuacaMol dataset does not result in a significant improvement on four ADME\nendpoints, namely, solubility, permeability, microsomal stability, and plasma\nprotein binding. Second, our results demonstrate that using domain adaptation\nby further training the transformer model on a small set of domain-relevant\nmolecules, i.e., a few hundred to a few thousand, using multi-task regression\nof physicochemical properties was sufficient to significantly improve\nperformance for three out of the four investigated ADME endpoints (P-value <\n0.001). 
Finally, we observe that a model pre-trained on 400K molecules and\ndomain adopted on a few hundred/thousand molecules performs similarly (P-value\n> 0.05) to more complicated transformer models like MolBERT(pre-trained on 1.3M\nmolecules) and MolFormer (pre-trained on 100M molecules). A comparison to a\nrandom forest model trained on basic physicochemical properties showed similar\nperformance to the examined transformer models. We believe that current\ntransformer models can be improved through further systematic analysis of\npre-training and downstream data, pre-training objectives, and scaling laws,\nultimately leading to better and more helpful models.\n","authors":["Afnan Sultan","Max Rausch-Dupont","Shahrukh Khan","Olga Kalinina","Andrea Volkamer","Dietrich Klakow"],"pdf_url":"https://arxiv.org/pdf/2503.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03355v1","updated":"2025-03-05T10:37:51Z","published":"2025-03-05T10:37:51Z","title":"Video Super-Resolution: All You Need is a Video Diffusion Model","summary":" We present a generic video super-resolution algorithm in this paper, based on\nthe Diffusion Posterior Sampling framework with an unconditional video\ngeneration model in latent space. The video generation model, a diffusion\ntransformer, functions as a space-time model. We argue that a powerful model,\nwhich learns the physics of the real world, can easily handle various kinds of\nmotion patterns as prior knowledge, thus eliminating the need for explicit\nestimation of optical flows or motion parameters for pixel alignment.\nFurthermore, a single instance of the proposed video diffusion transformer\nmodel can adapt to different sampling conditions without re-training. 
Due to\nlimited computational resources and training data, our experiments provide\nempirical evidence of the algorithm's strong super-resolution capabilities\nusing synthetic data.\n","authors":["Zhihao Zhan","Wang Pang","Xiang Zhu","Yechao Bai"],"pdf_url":"https://arxiv.org/pdf/2503.03355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18222v2","updated":"2025-03-05T10:17:25Z","published":"2024-05-28T14:30:07Z","title":"From Learning to Optimize to Learning Optimization Algorithms","summary":" Towards designing learned optimization algorithms that are usable beyond\ntheir training setting, we identify key principles that classical algorithms\nobey, but have up to now, not been used for Learning to Optimize (L2O).\nFollowing these principles, we provide a general design pipeline, taking into\naccount data, architecture and learning strategy, and thereby enabling a\nsynergy between classical optimization and L2O, resulting in a philosophy of\nLearning Optimization Algorithms. As a consequence our learned algorithms\nperform well far beyond problems from the training distribution. We demonstrate\nthe success of these novel principles by designing a new learning-enhanced BFGS\nalgorithm and provide numerical experiments evidencing its adaptation to many\nsettings at test time.\n","authors":["Camille Castera","Peter Ochs"],"pdf_url":"https://arxiv.org/pdf/2405.18222v2.pdf","comment":"To appear at AISTATS 2025"},{"id":"http://arxiv.org/abs/2405.19036v2","updated":"2025-03-05T10:15:19Z","published":"2024-05-29T12:23:48Z","title":"State Space Models are Provably Comparable to Transformers in Dynamic\n Token Selection","summary":" Deep neural networks based on state space models (SSMs) are attracting\nsignificant attention in sequence modeling since their computational cost is\nmuch smaller than that of Transformers. While the capabilities of SSMs have\nbeen demonstrated through experiments in various tasks, theoretical\nunderstanding of SSMs is still limited. 
In particular, most theoretical studies\ndiscuss the capabilities of SSM layers without nonlinear layers, and there is a\nlack of discussion on their combination with nonlinear layers. In this paper,\nwe explore the capabilities of SSMs combined with fully connected neural\nnetworks, and show that they are comparable to Transformers in extracting the\nessential tokens depending on the input. As concrete examples, we consider two\nsynthetic tasks, which are challenging for a single SSM layer, and demonstrate\nthat SSMs combined with nonlinear layers can efficiently solve these tasks.\nFurthermore, we study the nonparametric regression task, and prove that the\nability of SSMs is equivalent to that of Transformers in estimating functions\nbelonging to a certain class.\n","authors":["Naoki Nishikawa","Taiji Suzuki"],"pdf_url":"https://arxiv.org/pdf/2405.19036v2.pdf","comment":"43 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.05704v2","updated":"2025-03-05T10:07:22Z","published":"2024-07-08T08:06:45Z","title":"Narrowing the Gap between Adversarial and Stochastic MDPs via Policy\n Optimization","summary":" We consider the problem of learning in adversarial Markov decision processes\n[MDPs] with an oblivious adversary in a full-information setting. The agent\ninteracts with an environment during $T$ episodes, each of which consists of\n$H$ stages, and each episode is evaluated with respect to a reward function\nthat will be revealed only at the end of the episode. We propose an algorithm,\ncalled APO-MVP, that achieves a regret bound of order\n$\\tilde{\\mathcal{O}}(\\mathrm{poly}(H)\\sqrt{SAT})$, where $S$ and $A$ are sizes\nof the state and action spaces, respectively. 
This result improves upon the\nbest-known regret bound by a factor of $\\sqrt{S}$, bridging the gap between\nadversarial and stochastic MDPs, and matching the minimax lower bound\n$\\Omega(\\sqrt{H^3SAT})$ as far as the dependencies in $S,A,T$ are concerned.\nThe proposed algorithm and analysis completely avoid the typical tool given by\noccupancy measures; instead, it performs policy optimization based only on\ndynamic programming and on a black-box online linear optimization strategy run\nover estimated advantage functions, making it easy to implement. The analysis\nleverages two recent techniques: policy optimization based on online linear\noptimization strategies (Jonckheere et al., 2023) and a refined martingale\nanalysis of the impact on values of estimating transitions kernels (Zhang et\nal., 2023).\n","authors":["Daniil Tiapkin","Evgenii Chzhen","Gilles Stoltz"],"pdf_url":"https://arxiv.org/pdf/2407.05704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03331v1","updated":"2025-03-05T10:03:59Z","published":"2025-03-05T10:03:59Z","title":"Leap: Inductive Link Prediction via Learnable TopologyAugmentation","summary":" Link prediction is a crucial task in many downstream applications of graph\nmachine learning. To this end, Graph Neural Network (GNN) is a widely used\ntechnique for link prediction, mainly in transductive settings, where the goal\nis to predict missing links between existing nodes. However, many real-life\napplications require an inductive setting that accommodates for new nodes,\ncoming into an existing graph. Thus, recently inductive link prediction has\nattracted considerable attention, and a multi-layer perceptron (MLP) is the\npopular choice of most studies to learn node representations. However, these\napproaches have limited expressivity and do not fully capture the graph's\nstructural signal. Therefore, in this work we propose LEAP, an inductive link\nprediction method based on LEArnable toPology augmentation. 
Unlike previous\nmethods, LEAP models the inductive bias from both the structure and node\nfeatures, and hence is more expressive. To the best of our knowledge, this is\nthe first attempt to provide structural contexts for new nodes via learnable\naugmentation in inductive settings. Extensive experiments on seven real-world\nhomogeneous and heterogeneous graphs demonstrates that LEAP significantly\nsurpasses SOTA methods. The improvements are up to 22\\% and 17\\% in terms of\nAUC and average precision, respectively. The code and datasets are available on\nGitHub (https://github.com/AhmedESamy/LEAP/)\n","authors":["Ahmed E. Samy","Zekarias T. Kefato","Sarunas Girdzijauskas"],"pdf_url":"https://arxiv.org/pdf/2503.03331v1.pdf","comment":"published in Machine Learning, Optimization, and Data Science,\n Springer Nature Switzerland"},{"id":"http://arxiv.org/abs/2412.00497v2","updated":"2025-03-05T09:59:23Z","published":"2024-11-30T14:43:00Z","title":"Distributed Differentially Private Data Analytics via Secure Sketching","summary":" We introduce the linear-transformation model, a distributed model of\ndifferentially private data analysis. Clients have access to a trusted platform\ncapable of applying a public matrix to their inputs. Such computations can be\nsecurely distributed across multiple servers using simple and efficient secure\nmultiparty computation techniques.\n The linear-transformation model serves as an intermediate model between the\nhighly expressive central model and the minimal local model. In the central\nmodel, clients have access to a trusted platform capable of applying any\nfunction to their inputs. However, this expressiveness comes at a cost, as it\nis often prohibitively expensive to distribute such computations, leading to\nthe central model typically being implemented by a single trusted server. In\ncontrast, the local model assumes no trusted platform, which forces clients to\nadd significant noise to their data. 
The linear-transformation model avoids the\nsingle point of failure for privacy present in the central model, while also\nmitigating the high noise required in the local model.\n We demonstrate that linear transformations are very useful for differential\nprivacy, allowing for the computation of linear sketches of input data. These\nsketches largely preserve utility for tasks such as private low-rank\napproximation and private ridge regression, while introducing only minimal\nerror, critically independent of the number of clients.\n","authors":["Jakob Burkhardt","Hannah Keller","Claudio Orlandi","Chris Schwiegelshohn"],"pdf_url":"https://arxiv.org/pdf/2412.00497v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14425v3","updated":"2025-03-05T09:54:52Z","published":"2024-03-21T14:28:43Z","title":"Task-optimal data-driven surrogate models for eNMPC via differentiable\n simulation and optimization","summary":" Mechanistic dynamic process models may be too computationally expensive to be\nusable as part of a real-time capable predictive controller. We present a\nmethod for end-to-end learning of Koopman surrogate models for optimal\nperformance in a specific control task. In contrast to previous contributions\nthat employ standard reinforcement learning (RL) algorithms, we use a training\nalgorithm that exploits the differentiability of environments based on\nmechanistic simulation models to aid the policy optimization. We evaluate the\nperformance of our method by comparing it to that of other training algorithms\non an existing economic nonlinear model predictive control (eNMPC) case study\nof a continuous stirred-tank reactor (CSTR) model. Compared to the benchmark\nmethods, our method produces similar economic performance while eliminating\nconstraint violations. 
Thus, for this case study, our method outperforms the\nothers and offers a promising path toward more performant controllers that\nemploy dynamic surrogate models.\n","authors":["Daniel Mayfrank","Na Young Ahn","Alexander Mitsos","Manuel Dahmen"],"pdf_url":"https://arxiv.org/pdf/2403.14425v3.pdf","comment":"8 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2502.07780v3","updated":"2025-03-05T09:50:16Z","published":"2025-02-11T18:59:35Z","title":"DarwinLM: Evolutionary Structured Pruning of Large Language Models","summary":" Large Language Models (LLMs) have achieved significant success across various\nNLP tasks. However, their massive computational costs limit their widespread\nuse, particularly in real-time applications. Structured pruning offers an\neffective solution by compressing models and directly providing end-to-end\nspeed improvements, regardless of the hardware environment. Meanwhile,\ndifferent components of the model exhibit varying sensitivities towards\npruning, calling for non-uniform model compression. However, a pruning method\nshould not only identify a capable substructure, but also account for\npost-compression training. To this end, we propose DarwinLM, a method for\ntraining-aware structured pruning. DarwinLM builds upon an evolutionary search\nprocess, generating multiple offspring models in each generation through\nmutation, and selecting the fittest for survival. To assess the effect of\npost-training, we incorporate a lightweight, multistep training process within\nthe offspring population, progressively increasing the number of tokens and\neliminating poorly performing models in each selection stage. We validate our\nmethod through extensive experiments on Llama-2-7B, Llama-3.1-8B and\nQwen-2.5-14B-Instruct, achieving state-of-the-art performance for structured\npruning. For instance, DarwinLM surpasses ShearedLlama while requiring 5x less\ntraining data during post-compression training. 
Code is at:\nhttps://github.com/IST-DASLab/DarwinLM\n","authors":["Shengkun Tang","Oliver Sieberling","Eldar Kurtic","Zhiqiang Shen","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2502.07780v3.pdf","comment":"Code: https://github.com/IST-DASLab/DarwinLM"},{"id":"http://arxiv.org/abs/2503.03313v1","updated":"2025-03-05T09:45:22Z","published":"2025-03-05T09:45:22Z","title":"LLM as GNN: Graph Vocabulary Learning for Text-Attributed Graph\n Foundation Models","summary":" Text-Attributed Graphs (TAGs), where each node is associated with text\ndescriptions, are ubiquitous in real-world scenarios. They typically exhibit\ndistinctive structure and domain-specific knowledge, motivating the development\nof a Graph Foundation Model (GFM) that generalizes across diverse graphs and\ntasks. Despite large efforts to integrate Large Language Models (LLMs) and\nGraph Neural Networks (GNNs) for TAGs, existing approaches suffer from\ndecoupled architectures with two-stage alignment, limiting their synergistic\npotential. Even worse, existing methods assign out-of-vocabulary (OOV) tokens\nto graph nodes, leading to graph-specific semantics, token explosion, and\nincompatibility with task-oriented prompt templates, which hinders cross-graph\nand cross-task transferability. To address these challenges, we propose\nPromptGFM, a versatile GFM for TAGs grounded in graph vocabulary learning.\nPromptGFM comprises two key components: (1) Graph Understanding Module, which\nexplicitly prompts LLMs to replicate the finest GNN workflow within the text\nspace, facilitating seamless GNN-LLM integration and elegant graph-text\nalignment; (2) Graph Inference Module, which establishes a language-based graph\nvocabulary ensuring expressiveness, transferability, and scalability, enabling\nreadable instructions for LLM fine-tuning. Extensive experiments demonstrate\nour superiority and transferability across diverse graphs and tasks. 
The code\nis available at this: https://github.com/agiresearch/PromptGFM.\n","authors":["Xi Zhu","Haochen Xue","Ziwei Zhao","Wujiang Xu","Jingyuan Huang","Minghao Guo","Qifan Wang","Kaixiong Zhou","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03302v1","updated":"2025-03-05T09:36:57Z","published":"2025-03-05T09:36:57Z","title":"Differential Machine Learning for Time Series Prediction","summary":" Accurate time series prediction is challenging due to the inherent\nnonlinearity and sensitivity to initial conditions. We propose a novel approach\nthat enhances neural network predictions through differential learning, which\ninvolves training models on both the original time series and its differential\nseries. Specifically, we develop a differential long short-term memory\n(Diff-LSTM) network that uses a shared LSTM cell to simultaneously process both\ndata streams, effectively capturing intrinsic patterns and temporal dynamics.\nEvaluated on the Mackey-Glass, Lorenz, and R\\\"ossler chaotic time series, as\nwell as a real-world financial dataset from ACI Worldwide Inc., our results\ndemonstrate that the Diff- LSTM network outperforms prevalent models such as\nrecurrent neural networks, convolutional neural networks, and bidirectional and\nencoder-decoder LSTM networks in both short-term and long-term predictions.\nThis framework offers a promising solution for enhancing time series\nprediction, even when comprehensive knowledge of the underlying dynamics of the\ntime series is not fully available.\n","authors":["Akash Yadav","Eulalia Nualart"],"pdf_url":"https://arxiv.org/pdf/2503.03302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09126v3","updated":"2025-03-05T09:30:22Z","published":"2022-10-17T14:19:52Z","title":"Verifiable and Provably Secure Machine Unlearning","summary":" Machine unlearning aims to remove points from the training dataset of a\nmachine learning model after training: 
e.g., when a user requests their data to\nbe deleted. While many unlearning methods have been proposed, none of them\nenable users to audit the procedure. Furthermore, recent work shows a user is\nunable to verify whether their data was unlearnt from an inspection of the\nmodel parameter alone. Rather than reasoning about parameters, we propose to\nview verifiable unlearning as a security problem. To this end, we present the\nfirst cryptographic definition of verifiable unlearning to formally capture the\nguarantees of an unlearning system. In this framework, the server first\ncomputes a proof that the model was trained on a dataset D. Given a user's data\npoint d requested to be deleted, the server updates the model using an\nunlearning algorithm. It then provides a proof of the correct execution of\nunlearning and that d is not part of D', where D' is the new training dataset\n(i.e., d has been removed). Our framework is generally applicable to different\nunlearning techniques that we abstract as admissible functions. We instantiate\na protocol in the framework, based on cryptographic assumptions, using SNARKs\nand hash chains. Finally, we implement the protocol for three different\nunlearning techniques and validate its feasibility for linear regression,\nlogistic regression, and neural networks.\n","authors":["Thorsten Eisenhofer","Doreen Riepel","Varun Chandrasekaran","Esha Ghosh","Olga Ohrimenko","Nicolas Papernot"],"pdf_url":"https://arxiv.org/pdf/2210.09126v3.pdf","comment":"Accepted at IEEE SaTML2025"},{"id":"http://arxiv.org/abs/2501.18945v2","updated":"2025-03-05T09:13:02Z","published":"2025-01-31T08:08:32Z","title":"Solving Inverse Problem for Multi-armed Bandits via Convex Optimization","summary":" We consider the inverse problem of multi-armed bandits (IMAB) that are widely\nused in neuroscience and psychology research for behavior modelling. 
We first\nshow that the IMAB problem is not convex in general, but can be relaxed to a\nconvex problem via variable transformation. Based on this result, we propose a\ntwo-step sequential heuristic for (approximately) solving the IMAB problem. We\ndiscuss a condition where our method provides global solution to the IMAB\nproblem with certificate, as well as approximations to further save computing\ntime. Numerical experiments indicate that our heuristic method is more robust\nthan directly solving the IMAB problem via repeated local optimization, and can\nachieve the performance of Monte Carlo methods within a significantly decreased\nrunning time. We provide the implementation of our method based on CVXPY, which\nallows straightforward application by users not well versed in convex\noptimization.\n","authors":["Hao Zhu","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2501.18945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02368v2","updated":"2025-03-05T09:12:25Z","published":"2025-03-04T07:49:10Z","title":"Iterative Value Function Optimization for Guided Decoding","summary":" While Reinforcement Learning from Human Feedback (RLHF) has become the\npredominant method for controlling language model outputs, it suffers from high\ncomputational costs and training instability. Guided decoding, especially\nvalue-guided methods, offers a cost-effective alternative by controlling\noutputs without re-training models. However, the accuracy of the value function\nis crucial for value-guided decoding, as inaccuracies can lead to suboptimal\ndecision-making and degraded performance. Existing methods struggle with\naccurately estimating the optimal value function, leading to less effective\ncontrol. 
We propose Iterative Value Function Optimization, a novel framework\nthat addresses these limitations through two key components: Monte Carlo Value\nEstimation, which reduces estimation variance by exploring diverse\ntrajectories, and Iterative On-Policy Optimization, which progressively\nimproves value estimation through collecting trajectories from value-guided\npolicies. Extensive experiments on text summarization, multi-turn dialogue, and\ninstruction following demonstrate the effectiveness of value-guided decoding\napproaches in aligning language models. These approaches not only achieve\nalignment but also significantly reduce computational costs by leveraging\nprincipled value function optimization for efficient and effective control.\n","authors":["Zhenhua Liu","Lijun Li","Ruizhe Chen","Yuxian Jiang","Tong Zhu","Zhaochen Su","Wenliang Chen","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2503.02368v2.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2503.03285v1","updated":"2025-03-05T09:12:16Z","published":"2025-03-05T09:12:16Z","title":"Enhancing Vietnamese VQA through Curriculum Learning on Raw and\n Augmented Text Representations","summary":" Visual Question Answering (VQA) is a multimodal task requiring reasoning\nacross textual and visual inputs, which becomes particularly challenging in\nlow-resource languages like Vietnamese due to linguistic variability and the\nlack of high-quality datasets. Traditional methods often rely heavily on\nextensive annotated datasets, computationally expensive pipelines, and large\npre-trained models, specifically in the domain of Vietnamese VQA, limiting\ntheir applicability in such scenarios. To address these limitations, we propose\na training framework that combines a paraphrase-based feature augmentation\nmodule with a dynamic curriculum learning strategy. Explicitly, augmented\nsamples are considered \"easy\" while raw samples are regarded as \"hard\". 
The\nframework then utilizes a mechanism that dynamically adjusts the ratio of easy\nto hard samples during training, progressively modifying the same dataset to\nincrease its difficulty level. By enabling gradual adaptation to task\ncomplexity, this approach helps the Vietnamese VQA model generalize well, thus\nimproving overall performance. Experimental results show consistent\nimprovements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset,\nhighlighting both the potential and challenges of our approach in advancing VQA\nfor Vietnamese language.\n","authors":["Khoi Anh Nguyen","Linh Yen Vu","Thang Dinh Duong","Thuan Nguyen Duong","Huy Thanh Nguyen","Vinh Quang Dinh"],"pdf_url":"https://arxiv.org/pdf/2503.03285v1.pdf","comment":"10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and\n Intelligence"},{"id":"http://arxiv.org/abs/2503.03283v1","updated":"2025-03-05T09:09:01Z","published":"2025-03-05T09:09:01Z","title":"Exploring specialization and sensitivity of convolutional neural\n networks in the context of simultaneous image augmentations","summary":" Drawing parallels with the way biological networks are studied, we adapt the\ntreatment--control paradigm to explainable artificial intelligence research and\nenrich it through multi-parametric input alterations. In this study, we propose\na framework for investigating the internal inference impacted by input data\naugmentations. The internal changes in network operation are reflected in\nactivation changes measured by variance, which can be decomposed into\ncomponents related to each augmentation, employing Sobol indices and Shapley\nvalues. These quantities enable one to visualize sensitivity to different\nvariables and use them for guided masking of activations. In addition, we\nintroduce a way of single-class sensitivity analysis where the candidates are\nfiltered according to their matching to prediction bias generated by targeted\ndamaging of the activations. 
Relying on the observed parallels, we assume that\nthe developed framework can potentially be transferred to studying biological\nneural networks in complex environments.\n","authors":["Pavel Kharyuk","Sergey Matveev","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2503.03283v1.pdf","comment":"26 pages; main text: 5 figures, 4 tables; appendix: 4 sections, 3\n tables; supplementary: 7 files (figures S1-S6: packed as 7z archive, S7:\n single pdf file)"},{"id":"http://arxiv.org/abs/2410.17967v2","updated":"2025-03-05T09:05:23Z","published":"2024-10-23T15:34:11Z","title":"POMDP-Driven Cognitive Massive MIMO Radar: Joint Target\n Detection-Tracking In Unknown Disturbances","summary":" The joint detection and tracking of a moving target embedded in an unknown\ndisturbance represents a key feature that motivates the development of the\ncognitive radar paradigm. Building upon recent advancements in robust target\ndetection with multiple-input multiple-output (MIMO) radars, this work explores\nthe application of a Partially Observable Markov Decision Process (POMDP)\nframework to enhance the tracking and detection tasks in a statistically\nunknown environment. In the POMDP setup, the radar system is considered as an\nintelligent agent that continuously senses the surrounding environment,\noptimizing its actions to maximize the probability of detection $(P_D)$ and\nimprove the target position and velocity estimation, all this while keeping a\nconstant probability of false alarm $(P_{FA})$. The proposed approach employs\nan online algorithm that does not require any apriori knowledge of the noise\nstatistics, and it relies on a much more general observation model than the\ntraditional range-azimuth-elevation model employed by conventional tracking\nalgorithms. 
Simulation results clearly show substantial performance improvement\nof the POMDP-based algorithm compared to the State-Action-Reward-State-Action\n(SARSA)-based one that has been recently investigated in the context of massive\nMIMO (MMIMO) radar systems.\n","authors":["Imad Bouhou","Stefano Fortunati","Leila Gharsalli","Alexandre Renaux"],"pdf_url":"https://arxiv.org/pdf/2410.17967v2.pdf","comment":"The paper has been submitted to ieee Transactions on radar systems"},{"id":"http://arxiv.org/abs/2503.03276v1","updated":"2025-03-05T08:59:06Z","published":"2025-03-05T08:59:06Z","title":"TrafficKAN-GCN: Graph Convolutional-based Kolmogorov-Arnold Network for\n Traffic Flow Optimization","summary":" Urban traffic optimization is critical for improving transportation\nefficiency and alleviating congestion, particularly in large-scale dynamic\nnetworks. Traditional methods, such as Dijkstra's and Floyd's algorithms,\nprovide effective solutions in static settings, but they struggle with the\nspatial-temporal complexity of real-world traffic flows. In this work, we\npropose TrafficKAN-GCN, a hybrid deep learning framework combining\nKolmogorov-Arnold Networks (KAN) with Graph Convolutional Networks (GCN),\ndesigned to enhance urban traffic flow optimization. By integrating KAN's\nadaptive nonlinear function approximation with GCN's spatial graph learning\ncapabilities, TrafficKAN-GCN captures both complex traffic patterns and\ntopological dependencies. We evaluate the proposed framework using real-world\ntraffic data from the Baltimore Metropolitan area. Compared with baseline\nmodels such as MLP-GCN, standard GCN, and Transformer-based approaches,\nTrafficKAN-GCN achieves competitive prediction accuracy while demonstrating\nimproved robustness in handling noisy and irregular traffic data. 
Our\nexperiments further highlight the framework's ability to redistribute traffic\nflow, mitigate congestion, and adapt to disruptive events, such as the Francis\nScott Key Bridge collapse. This study contributes to the growing body of work\non hybrid graph learning for intelligent transportation systems, highlighting\nthe potential of combining KAN and GCN for real-time traffic optimization.\nFuture work will focus on reducing computational overhead and integrating\nTransformer-based temporal modeling for enhanced long-term traffic prediction.\nThe proposed TrafficKAN-GCN framework offers a promising direction for\ndata-driven urban mobility management, balancing predictive accuracy,\nrobustness, and computational efficiency.\n","authors":["Jiayi Zhang","Yiming Zhang","Yuan Zheng","Yuchen Wang","Jinjiang You","Yuchen Xu","Wenxing Jiang","Soumyabrata Dev"],"pdf_url":"https://arxiv.org/pdf/2503.03276v1.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2503.03274v1","updated":"2025-03-05T08:56:26Z","published":"2025-03-05T08:56:26Z","title":"Benchmarking Dynamic SLO Compliance in Distributed Computing Continuum\n Systems","summary":" Ensuring Service Level Objectives (SLOs) in large-scale architectures, such\nas Distributed Computing Continuum Systems (DCCS), is challenging due to their\nheterogeneous nature and varying service requirements across different devices\nand applications. Additionally, unpredictable workloads and resource\nlimitations lead to fluctuating performance and violated SLOs. To improve SLO\ncompliance in DCCS, one possibility is to apply machine learning; however, the\ndesign choices are often left to the developer. To that extent, we provide a\nbenchmark of Active Inference -- an emerging method from neuroscience --\nagainst three established reinforcement learning algorithms (Deep Q-Network,\nAdvantage Actor-Critic, and Proximal Policy Optimization). 
We consider a\nrealistic DCCS use case: an edge device running a video conferencing\napplication alongside a WebSocket server streaming videos. Using one of the\nrespective algorithms, we continuously monitor key performance metrics, such as\nlatency and bandwidth usage, to dynamically adjust parameters -- including the\nnumber of streams, frame rate, and resolution -- to optimize service quality\nand user experience. To test algorithms' adaptability to constant system\nchanges, we simulate dynamically changing SLOs and both instant and gradual\ndata-shift scenarios, such as network bandwidth limitations and fluctuating\ndevice thermal states. Although the evaluated algorithms all showed advantages\nand limitations, our findings demonstrate that Active Inference is a promising\napproach for ensuring SLO compliance in DCCS, offering lower memory usage,\nstable CPU utilization, and fast convergence.\n","authors":["Alfreds Lapkovskis","Boris Sedlak","Sindri Magnússon","Schahram Dustdar","Praveen Kumar Donta"],"pdf_url":"https://arxiv.org/pdf/2503.03274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03269v1","updated":"2025-03-05T08:50:53Z","published":"2025-03-05T08:50:53Z","title":"Conformal Transformations for Symmetric Power Transformers","summary":" Transformers with linear attention offer significant computational advantages\nover softmax-based transformers but often suffer from degraded performance. The\nsymmetric power (sympow) transformer, a particular type of linear transformer,\naddresses some of this performance gap by leveraging symmetric tensor\nembeddings, achieving comparable performance to softmax transformers. However,\nthe finite capacity of the recurrent state in sympow transformers limits their\nability to retain information, leading to performance degradation when scaling\nthe training or evaluation context length. 
To address this issue, we propose\nthe conformal-sympow transformer, which dynamically frees up capacity using\ndata-dependent multiplicative gating and adaptively stores information using\ndata-dependent rotary embeddings. Preliminary experiments on the LongCrawl64\ndataset demonstrate that conformal-sympow overcomes the limitations of sympow\ntransformers, achieving robust performance across scaled training and\nevaluation contexts.\n","authors":["Saurabh Kumar","Jacob Buckman","Carles Gelada","Sean Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03269v1.pdf","comment":"SCOPE Workshop at ICLR 2025"},{"id":"http://arxiv.org/abs/2412.06464v2","updated":"2025-03-05T08:47:27Z","published":"2024-12-09T13:09:04Z","title":"Gated Delta Networks: Improving Mamba2 with Delta Rule","summary":" Linear Transformers have gained attention as efficient alternatives to\nstandard Transformers, but their performance in retrieval and long-context\ntasks has been limited. To address these limitations, recent work has explored\ntwo distinct mechanisms: gating for adaptive memory control and the delta\nupdate rule for precise memory modifications. We observe that these mechanisms\nare complementary: gating enables rapid memory erasure while the delta rule\nfacilitates targeted updates. Building on this insight, we introduce the gated\ndelta rule and develop a parallel training algorithm optimized for modern\nhardware. Our proposed architecture, Gated DeltaNet, consistently surpasses\nexisting models like Mamba2 and DeltaNet across multiple benchmarks, including\nlanguage modeling, common-sense reasoning, in-context retrieval, length\nextrapolation, and long-context understanding. 
We further enhance performance\nby developing hybrid architectures that combine Gated DeltaNet layers with\nsliding window attention or Mamba2 layers, achieving both improved training\nefficiency and superior task performance.\n","authors":["Songlin Yang","Jan Kautz","Ali Hatamizadeh"],"pdf_url":"https://arxiv.org/pdf/2412.06464v2.pdf","comment":"ICLR 2025 camera ready"},{"id":"http://arxiv.org/abs/2408.07246v3","updated":"2025-03-05T08:43:44Z","published":"2024-08-14T01:16:40Z","title":"ChemVLM: Exploring the Power of Multimodal Large Language Models in\n Chemistry Area","summary":" Large Language Models (LLMs) have achieved remarkable success and have been\napplied across various scientific fields, including chemistry. However, many\nchemical tasks require the processing of visual information, which cannot be\nsuccessfully handled by existing chemical LLMs. This brings a growing need for\nmodels capable of integrating multimodal information in the chemical domain. In\nthis paper, we introduce \\textbf{ChemVLM}, an open-source chemical multimodal\nlarge language model specifically designed for chemical applications. ChemVLM\nis trained on a carefully curated bilingual multimodal dataset that enhances\nits ability to understand both textual and visual chemical information,\nincluding molecular structures, reactions, and chemistry examination questions.\nWe develop three datasets for comprehensive evaluation, tailored to Chemical\nOptical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and\nMultimodal Molecule Understanding tasks. We benchmark ChemVLM against a range\nof open-source and proprietary multimodal large language models on various\ntasks. Experimental results demonstrate that ChemVLM achieves competitive\nperformance across all evaluated tasks. 
Our model can be found at\nhttps://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Yaotian Yang","Xinrui Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v3.pdf","comment":"11 pages, updated version"},{"id":"http://arxiv.org/abs/2502.16232v2","updated":"2025-03-05T08:42:40Z","published":"2025-02-22T14:04:23Z","title":"Flow-based Bayesian filtering for high-dimensional nonlinear stochastic\n dynamical systems","summary":" Bayesian filtering for high-dimensional nonlinear stochastic dynamical\nsystems is a fundamental yet challenging problem in many fields of science and\nengineering. Existing methods face significant obstacles: Gaussian-based\nfilters struggle with non-Gaussian distributions, while sequential Monte Carlo\nmethods are computationally intensive and prone to particle degeneracy in high\ndimensions. Although generative models in machine learning have made\nsignificant progress in modeling high-dimensional non-Gaussian distributions,\ntheir inefficiency in online updating limits their applicability to filtering\nproblems. To address these challenges, we propose a flow-based Bayesian filter\n(FBF) that integrates normalizing flows to construct a novel latent linear\nstate-space model with Gaussian filtering distributions. This framework\nfacilitates efficient density estimation and sampling using invertible\ntransformations provided by normalizing flows, and it enables the construction\nof filters in a data-driven manner, without requiring prior knowledge of system\ndynamics or observation models. 
Numerical experiments demonstrate the superior\naccuracy and efficiency of FBF.\n","authors":["Xintong Wang","Xiaofei Guan","Ling Guo","Hao Wu"],"pdf_url":"https://arxiv.org/pdf/2502.16232v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03262v1","updated":"2025-03-05T08:38:51Z","published":"2025-03-05T08:38:51Z","title":"Trajectory Prediction for Autonomous Driving: Progress, Limitations, and\n Future Directions","summary":" As the potential for autonomous vehicles to be integrated on a large scale\ninto modern traffic systems continues to grow, ensuring safe navigation in\ndynamic environments is crucial for smooth integration. To guarantee safety and\nprevent collisions, autonomous vehicles must be capable of accurately\npredicting the trajectories of surrounding traffic agents. Over the past\ndecade, significant efforts from both academia and industry have been dedicated\nto designing solutions for precise trajectory forecasting. These efforts have\nproduced a diverse range of approaches, raising questions about the differences\nbetween these methods and whether trajectory prediction challenges have been\nfully addressed. This paper reviews a substantial portion of recent trajectory\nprediction methods and devises a taxonomy to classify existing solutions. A\ngeneral overview of the prediction pipeline is also provided, covering input\nand output modalities, modeling features, and prediction paradigms discussed in\nthe literature. 
In addition, the paper discusses active research areas within\ntrajectory prediction, addresses the posed research questions, and highlights\nthe remaining research gaps and challenges.\n","authors":["Nadya Abdel Madjid","Abdulrahman Ahmad","Murad Mebrahtu","Yousef Babaa","Abdelmoamen Nasser","Sumbal Malik","Bilal Hassan","Naoufel Werghi","Jorge Dias","Majid Khonji"],"pdf_url":"https://arxiv.org/pdf/2503.03262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04910v3","updated":"2025-03-05T08:37:17Z","published":"2024-12-06T10:05:10Z","title":"Learning High-Degree Parities: The Crucial Role of the Initialization","summary":" Parities have become a standard benchmark for evaluating learning algorithms.\nRecent works show that regular neural networks trained by gradient descent can\nefficiently learn degree $k$ parities on uniform inputs for constant $k$, but\nfail to do so when $k$ and $d-k$ grow with $d$ (here $d$ is the ambient\ndimension). However, the case where $k=d-O_d(1)$ (almost-full parities),\nincluding the degree $d$ parity (the full parity), has remained unsettled. This\npaper shows that for gradient descent on regular neural networks, learnability\ndepends on the initial weight distribution. On one hand, the discrete\nRademacher initialization enables efficient learning of almost-full parities,\nwhile on the other hand, its Gaussian perturbation with large enough constant\nstandard deviation $\\sigma$ prevents it. The positive result for almost-full\nparities is shown to hold up to $\\sigma=O(d^{-1})$, pointing to questions about\na sharper threshold phenomenon. 
Unlike statistical query (SQ) learning, where a\nsingleton function class like the full parity is trivially learnable, our\nnegative result applies to a fixed function and relies on an initial gradient\nalignment measure of potential broader relevance to neural networks learning.\n","authors":["Emmanuel Abbe","Elisabetta Cornacchia","Jan Hązła","Donald Kougang-Yombi"],"pdf_url":"https://arxiv.org/pdf/2412.04910v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06927v3","updated":"2025-03-05T08:35:41Z","published":"2024-08-13T14:29:00Z","title":"Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class\n Feature Compensator","summary":" Dataset distillation has emerged as a technique aiming to condense\ninformative features from large, natural datasets into a compact and synthetic\nform. While recent advancements have refined this technique, its performance is\nbottlenecked by the prevailing class-specific synthesis paradigm. Under this\nparadigm, synthetic data is optimized exclusively for a pre-assigned one-hot\nlabel, creating an implicit class barrier in feature condensation. This leads\nto inefficient utilization of the distillation budget and oversight of\ninter-class feature distributions, which ultimately limits the effectiveness\nand efficiency, as demonstrated in our analysis. To overcome these constraints,\nthis paper presents the Inter-class Feature Compensator (INFER), an innovative\ndistillation approach that transcends the class-specific data-label framework\nwidely utilized in current dataset distillation methods. Specifically, INFER\nleverages a Universal Feature Compensator (UFC) to enhance feature integration\nacross classes, enabling the generation of multiple additional synthetic\ninstances from a single UFC input. This significantly improves the efficiency\nof the distillation budget. 
Moreover, INFER enriches inter-class interactions\nduring the distillation, thereby enhancing the effectiveness and\ngeneralizability of the distilled data. By allowing for the linear\ninterpolation of labels similar to those in the original dataset, INFER\nmeticulously optimizes the synthetic data and dramatically reduces the size of\nsoft labels in the synthetic dataset to almost zero, establishing a new\nbenchmark for efficiency and effectiveness in dataset distillation. In\npractice, INFER demonstrates state-of-the-art performance across benchmark\ndatasets. For instance, in the ipc = 50 setting on ImageNet-1k with the same\ncompression level, it outperforms SRe2L by 34.5% using ResNet18.\n","authors":["Xin Zhang","Jiawei Du","Ping Liu","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.06927v3.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2305.15759v6","updated":"2025-03-05T08:34:25Z","published":"2023-05-25T06:18:31Z","title":"DP-LDMs: Differentially Private Latent Diffusion Models","summary":" Diffusion models (DMs) are one of the most widely used generative models for\nproducing high quality images. However, a flurry of recent papers points out\nthat DMs are least private forms of image generators, by extracting a\nsignificant number of near-identical replicas of training images from DMs.\nExisting privacy-enhancing techniques for DMs, unfortunately, do not provide a\ngood privacy-utility tradeoff. In this paper, we aim to improve the current\nstate of DMs with differential privacy (DP) by adopting the $\\textit{Latent}$\nDiffusion Models (LDMs). LDMs are equipped with powerful pre-trained\nautoencoders that map the high-dimensional pixels into lower-dimensional latent\nrepresentations, in which DMs are trained, yielding a more efficient and fast\ntraining of DMs. 
Rather than fine-tuning the entire LDMs, we fine-tune only the\n$\\textit{attention}$ modules of LDMs with DP-SGD, reducing the number of\ntrainable parameters by roughly $90\\%$ and achieving a better privacy-accuracy\ntrade-off. Our approach allows us to generate realistic, high-dimensional\nimages (256x256) conditioned on text prompts with DP guarantees, which, to the\nbest of our knowledge, has not been attempted before. Our approach provides a\npromising direction for training more powerful, yet training-efficient\ndifferentially private DMs, producing high-quality DP images. Our code is\navailable at https://anonymous.4open.science/r/DP-LDM-4525.\n","authors":["Michael F. Liu","Saiyue Lyu","Margarita Vinaroz","Mijung Park"],"pdf_url":"https://arxiv.org/pdf/2305.15759v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03258v1","updated":"2025-03-05T08:28:11Z","published":"2025-03-05T08:28:11Z","title":"Exploring the Potential of Large Language Models as Predictors in\n Dynamic Text-Attributed Graphs","summary":" With the rise of large language models (LLMs), there has been growing\ninterest in Graph Foundation Models (GFMs) for graph-based tasks. By leveraging\nLLMs as predictors, GFMs have demonstrated impressive generalizability across\nvarious tasks and datasets. However, existing research on LLMs as predictors\nhas predominantly focused on static graphs, leaving their potential in dynamic\ngraph prediction unexplored. In this work, we pioneer using LLMs for predictive\ntasks on dynamic graphs. We identify two key challenges: the constraints\nimposed by context length when processing large-scale historical data and the\nsignificant variability in domain characteristics, both of which complicate the\ndevelopment of a unified predictor. To address these challenges, we propose the\nGraphAgent-Dynamic (GAD) Framework, a multi-agent system that leverages\ncollaborative LLMs. 
In contrast to using a single LLM as the predictor, GAD\nincorporates global and local summary agents to generate domain-specific\nknowledge, enhancing its transferability across domains. Additionally,\nknowledge reflection agents enable adaptive updates to GAD's knowledge,\nmaintaining a unified and self-consistent architecture. In experiments, GAD\ndemonstrates performance comparable to or even exceeds that of full-supervised\ngraph neural networks without dataset-specific training. Finally, to enhance\nthe task-specific performance of LLM-based predictors, we discuss potential\nimprovements, such as dataset-specific fine-tuning to LLMs. By developing\ntailored strategies for different tasks, we provide new insights for the future\ndesign of LLM-based predictors.\n","authors":["Runlin Lei","Jiarui Ji","Haipeng Ding","Lu Yi","Zhewei Wei","Yongchao Liu","Chuntao Hong"],"pdf_url":"https://arxiv.org/pdf/2503.03258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01669v2","updated":"2025-03-05T08:23:02Z","published":"2024-01-16T13:41:00Z","title":"Improved Performances and Motivation in Intelligent Tutoring Systems:\n Combining Machine Learning and Learner Choice","summary":" Large class sizes challenge personalized learning in schools, prompting the\nuse of educational technologies such as intelligent tutoring systems. To\naddress this, we present an AI-driven personalization system, called ZPDES,\nbased on the Learning Progress Hypothesis - modeling curiosity-driven learning\n- and multi-armed bandit techniques. It sequences exercises that maximize\nlearning progress for each student. While previous studies demonstrated its\nefficacy in enhancing learning compared to hand-made curricula, its impact on\nstudent motivation remained unexplored. Furthermore, ZPDES previously lacked\nfeatures allowing student choice, a limitation in agency that conflicts with\nits foundation on models of curiosity-driven learning. 
This study investigates\nhow integrating choice, as a gamification element unrelated to exercise\ndifficulty, affects both learning outcomes and motivation. We conducted an\nextensive field study (265 7-8 years old children, RCT design), comparing ZPDES\nwith and without choice against a hand-designed curriculum. Results show that\nZPDES improves both learning performance and the learning experience. Moreover\nadding choice to ZPDES enhances intrinsic motivation and further strengthens\nits learning benefits. In contrast, incorporating choice into a fixed, linear\ncurriculum negatively impacts learning outcomes. These findings highlight that\nthe intrinsic motivation elicited by choice (gamification) is beneficial only\nwhen paired with an adaptive personalized learning system. This insight is\ncritical as gamified features become increasingly prevalent in educational\ntechnologies.\n","authors":["Benjamin Clément","Hélène Sauzéon","Didier Roy","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2402.01669v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01297v2","updated":"2025-03-05T08:03:39Z","published":"2025-03-03T08:33:35Z","title":"Regularization-based Framework for Quantization-, Fault- and\n Variability-Aware Training","summary":" Efficient inference is critical for deploying deep learning models on edge AI\ndevices. Low-bit quantization (e.g., 3- and 4-bit) with fixed-point arithmetic\nimproves efficiency, while low-power memory technologies like analog\nnonvolatile memory enable further gains. However, these methods introduce\nnon-ideal hardware behavior, including bit faults and device-to-device\nvariability. We propose a regularization-based quantization-aware training\n(QAT) framework that supports fixed, learnable step-size, and learnable\nnon-uniform quantization, achieving competitive results on CIFAR-10 and\nImageNet. 
Our method also extends to Spiking Neural Networks (SNNs),\ndemonstrating strong performance on 4-bit networks on CIFAR10-DVS and N-Caltech\n101. Beyond quantization, our framework enables fault and variability-aware\nfine-tuning, mitigating stuck-at faults (fixed weight bits) and device\nresistance variability. Compared to prior fault-aware training, our approach\nsignificantly improves performance recovery under upto 20% bit-fault rate and\n40% device-to-device variability. Our results establish a generalizable\nframework for quantization and robustness-aware training, enhancing efficiency\nand reliability in low-power, non-ideal hardware.\n","authors":["Anmol Biswas","Raghav Singhal","Sivakumar Elangovan","Shreyas Sabnis","Udayan Ganguly"],"pdf_url":"https://arxiv.org/pdf/2503.01297v2.pdf","comment":"AB and RS contributed equally to this work. A version of this paper\n accepted at MLNCP @ NeuRIPS '24"},{"id":"http://arxiv.org/abs/2503.03245v1","updated":"2025-03-05T07:53:39Z","published":"2025-03-05T07:53:39Z","title":"Less is more? Rewards in RL for Cyber Defence","summary":" The last few years has seen an explosion of interest in autonomous cyber\ndefence agents based on deep reinforcement learning. Such agents are typically\ntrained in a cyber gym environment, also known as a cyber simulator, at least\n32 of which have already been built. Most, if not all cyber gyms provide dense\n\"scaffolded\" reward functions which combine many penalties or incentives for a\nrange of (un)desirable states and costly actions. Whilst dense rewards help\nalleviate the challenge of exploring complex environments, yielding seemingly\neffective strategies from relatively few environment steps; they are also known\nto bias the solutions an agent can find, potentially towards suboptimal\nsolutions. Sparse rewards could offer preferable or more effective solutions\nand have been overlooked by cyber gyms to date. 
In this work we set out to\nevaluate whether sparse reward functions might enable training more effective\ncyber defence agents. Towards this goal we first break down several evaluation\nlimitations in existing work by proposing a ground truth evaluation score that\ngoes beyond the standard RL paradigm used to train and evaluate agents. By\nadapting a well-established cyber gym to accommodate our methodology and ground\ntruth score, we propose and evaluate two sparse reward mechanisms and compare\nthem with a typical dense reward. Our evaluation considers a range of network\nsizes, from 2 to 50 nodes, and both reactive and proactive defensive actions.\nOur results show that sparse rewards, particularly positive reinforcement for\nan uncompromised network state, enable the training of more effective cyber\ndefence agents. Furthermore, we show that sparse rewards provide more stable\ntraining than dense rewards, and that both effectiveness and training stability\nare robust to a variety of cyber environment considerations.\n","authors":["Elizabeth Bates","Chris Hicks","Vasilios Mavroudis"],"pdf_url":"https://arxiv.org/pdf/2503.03245v1.pdf","comment":"4 Pages"},{"id":"http://arxiv.org/abs/2503.03241v1","updated":"2025-03-05T07:47:57Z","published":"2025-03-05T07:47:57Z","title":"Structural Entropy Guided Unsupervised Graph Out-Of-Distribution\n Detection","summary":" With the emerging of huge amount of unlabeled data, unsupervised\nout-of-distribution (OOD) detection is vital for ensuring the reliability of\ngraph neural networks (GNNs) by identifying OOD samples from in-distribution\n(ID) ones during testing, where encountering novel or unknown data is\ninevitable. Existing methods often suffer from compromised performance due to\nredundant information in graph structures, which impairs their ability to\neffectively differentiate between ID and OOD data. 
To address this challenge,\nwe propose SEGO, an unsupervised framework that integrates structural entropy\ninto OOD detection regarding graph classification. Specifically, within the\narchitecture of contrastive learning, SEGO introduces an anchor view in the\nform of coding tree by minimizing structural entropy. The obtained coding tree\neffectively removes redundant information from graphs while preserving\nessential structural information, enabling the capture of distinct graph\npatterns between ID and OOD samples. Furthermore, we present a multi-grained\ncontrastive learning scheme at local, global, and tree levels using triplet\nviews, where coding trees with essential information serve as the anchor view.\nExtensive experiments on real-world datasets validate the effectiveness of\nSEGO, demonstrating superior performance over state-of-the-art baselines in OOD\ndetection. Specifically, our method achieves the best performance on 9 out of\n10 dataset pairs, with an average improvement of 3.7\\% on OOD detection\ndatasets, significantly surpassing the best competitor by 10.8\\% on the\nFreeSolv/ToxCast dataset pair.\n","authors":["Yue Hou","He Zhu","Ruomei Liu","Yingke Su","Jinxiang Xia","Junran Wu","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2503.03241v1.pdf","comment":"Accepted by AAAI 2025 (The 39th Annual AAAI Conference on Artificial\n Intelligence)"},{"id":"http://arxiv.org/abs/2503.03239v1","updated":"2025-03-05T07:45:56Z","published":"2025-03-05T07:45:56Z","title":"PAIR: A Novel Large Language Model-Guided Selection Strategy for\n Evolutionary Algorithms","summary":" Evolutionary Algorithms (EAs) employ random or simplistic selection methods,\nlimiting their exploration of solution spaces and convergence to optimal\nsolutions. The randomness in performing crossover or mutations may limit the\nmodel's ability to evolve efficiently. 
This paper introduces Preference-Aligned\nIndividual Reciprocity (PAIR), a novel selection approach leveraging Large\nLanguage Models to emulate human-like mate selection, thereby introducing\nintelligence to the pairing process in EAs. PAIR prompts an LLM to evaluate\nindividuals within a population based on genetic diversity, fitness level, and\ncrossover compatibility, guiding more informed pairing decisions. We evaluated\nPAIR against a baseline method called LLM-driven EA (LMEA), published recently.\nResults indicate that PAIR significantly outperforms LMEA across various TSP\ninstances, achieving lower optimality gaps and improved convergence. This\nperformance is especially noticeable when combined with the flash thinking\nmodel, demonstrating increased population diversity to escape local optima. In\ngeneral, PAIR provides a new strategy in the area of in-context learning for\nLLM-driven selection in EAs via sophisticated preference modelling, paving the\nway for improved solutions and further studies into LLM-guided optimization.\n","authors":["Shady Ali","Mahmoud Ashraf","Seif Hegazy","Fatty Salem","Hoda Mokhtar","Mohamed Medhat Gaber","Mohamed Taher Alrefaie"],"pdf_url":"https://arxiv.org/pdf/2503.03239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03237v1","updated":"2025-03-05T07:31:06Z","published":"2025-03-05T07:31:06Z","title":"Prediction of Halo Coronal Mass Ejections Using SDO/HMI Vector Magnetic\n Data Products and a Transformer Model","summary":" We present a transformer model, named DeepHalo, to predict the occurrence of\nhalo coronal mass ejections (CMEs). Our model takes as input an active region\n(AR) and a profile, where the profile contains a time series of data samples in\nthe AR that are collected 24 hours before the beginning of a day, and predicts\nwhether the AR would produce a halo CME during that day. 
Each data sample\ncontains physical parameters, or features, derived from photospheric vector\nmagnetic field data taken by the Helioseismic and Magnetic Imager (HMI) on\nboard the Solar Dynamics Observatory (SDO). We survey and match CME events in\nthe Space Weather Database Of Notification, Knowledge, Information (DONKI) and\nLarge Angle and Spectrometric Coronagraph (LASCO) CME Catalog, and compile a\nlist of CMEs including halo CMEs and non-halo CMEs associated with ARs in the\nperiod between November 2010 and August 2023. We use the information gathered\nabove to build the labels (positive versus negative) of the data samples and\nprofiles at hand, where the labels are needed for machine learning.\nExperimental results show that DeepHalo with a true skill statistics (TSS)\nscore of 0.907 outperforms a closely related long short-term memory network\nwith a TSS score of 0.821. To our knowledge, this is the first time that the\ntransformer model has been used for halo CME prediction.\n","authors":["Hongyang Zhang","Ju Jing","Jason T. L. Wang","Haimin Wang","Yasser Abduallah","Yan Xu","Khalid A. Alobaid","Hameedullah Farooki","Vasyl Yurchyshyn"],"pdf_url":"https://arxiv.org/pdf/2503.03237v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.17107v3","updated":"2025-03-05T07:29:42Z","published":"2024-12-22T17:39:32Z","title":"Grams: Gradient Descent with Adaptive Momentum Scaling","summary":" We introduce $\\mathbf{G}$radient Descent with $\\mathbf{A}$daptive\n$\\mathbf{M}$omentum $\\mathbf{S}$caling ($\\mathbf{Grams}$), a novel optimization\nalgorithm that decouples the direction and magnitude of parameter updates in\ndeep learning. 
Unlike traditional optimizers that directly integrate momentum\ninto updates, Grams separates the update direction, derived from current\ngradients, from momentum, which is used solely for adaptive magnitude scaling.\nThis approach enables Grams to achieve improved loss descent compared to\nstate-of-the-art cautious and momentum-based optimizers. We theoretically\ndemonstrate that Grams descents faster than other state-of-the-art optimizers\nand establish a global convergence guarantee for Grams. We also validate its\neffectiveness through extensive empirical evaluations. The results demonstrate\nGrams' superior performance, including faster convergence and better\ngeneralization, compared to widely-used optimizers such as Adam, Lion, and\ntheir cautious variants. Our results highlight Grams' potential as a\ntransformative approach for efficiently training and fine-tuning large language\nmodels. Code is available at https://github.com/Gunale0926/Grams.\n","authors":["Yang Cao","Xiaoyu Li","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2412.17107v3.pdf","comment":"SCOPE Workshop @ ICLR 2025"},{"id":"http://arxiv.org/abs/2407.01214v3","updated":"2025-03-05T07:02:28Z","published":"2024-07-01T11:59:59Z","title":"Revisiting Random Walks for Learning on Graphs","summary":" We revisit a simple model class for machine learning on graphs, where a\nrandom walk on a graph produces a machine-readable record, and this record is\nprocessed by a deep neural network to directly make vertex-level or graph-level\npredictions. We call these stochastic machines random walk neural networks\n(RWNNs), and through principled analysis, show that we can design them to be\nisomorphism invariant while capable of universal approximation of graph\nfunctions in probability. A useful finding is that almost any kind of record of\nrandom walks guarantees probabilistic invariance as long as the vertices are\nanonymized. 
This enables us, for example, to record random walks in plain text\nand adopt a language model to read these text records to solve graph tasks. We\nfurther establish a parallelism to message passing neural networks using tools\nfrom Markov chain theory, and show that over-smoothing in message passing is\nalleviated by construction in RWNNs, while over-squashing manifests as\nprobabilistic under-reaching. We empirically demonstrate RWNNs on a range of\nproblems, verifying our theoretical analysis and demonstrating the use of\nlanguage models for separating strongly regular graphs where 3-WL test fails,\nand transductive classification on arXiv citation network. Code is available at\nhttps://github.com/jw9730/random-walk.\n","authors":["Jinwoo Kim","Olga Zaghen","Ayhan Suleymanzade","Youngmin Ryou","Seunghoon Hong"],"pdf_url":"https://arxiv.org/pdf/2407.01214v3.pdf","comment":"51 pages, 14 figures"},{"id":"http://arxiv.org/abs/2502.17543v2","updated":"2025-03-05T06:53:52Z","published":"2025-02-24T18:56:58Z","title":"Training a Generally Curious Agent","summary":" Efficient exploration is essential for intelligent systems interacting with\ntheir environment, but existing language models often fall short in scenarios\nthat require strategic information gathering. In this paper, we present\nPAPRIKA, a fine-tuning approach that enables language models to develop general\ndecision-making capabilities that are not confined to particular environments.\nBy training on synthetic interaction data from different tasks that require\ndiverse strategies, PAPRIKA teaches models to explore and adapt their behavior\non a new task based on environment feedback in-context without more gradient\nupdates. Experimental results show that models fine-tuned with PAPRIKA can\neffectively transfer their learned decision-making capabilities to entirely\nunseen tasks without additional training. 
Unlike traditional training, our\napproach's primary bottleneck lies in sampling useful interaction data instead\nof model updates. To improve sample efficiency, we propose a curriculum\nlearning strategy that prioritizes sampling trajectories from tasks with high\nlearning potential. These results suggest a promising path towards AI systems\nthat can autonomously solve novel sequential decision-making problems that\nrequire interactions with the external world.\n","authors":["Fahim Tajwar","Yiding Jiang","Abitha Thankaraj","Sumaita Sadia Rahman","J Zico Kolter","Jeff Schneider","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2502.17543v2.pdf","comment":"Project Website: https://paprika-llm.github.io"},{"id":"http://arxiv.org/abs/2407.10341v5","updated":"2025-03-05T06:53:17Z","published":"2024-07-14T21:41:29Z","title":"Affordance-Guided Reinforcement Learning via Visual Prompting","summary":" Robots equipped with reinforcement learning (RL) have the potential to learn\na wide range of skills solely from a reward signal. However, obtaining a robust\nand dense reward signal for general manipulation tasks remains a challenge.\nExisting learning-based approaches require significant data, such as human\ndemonstrations of success and failure, to learn task-specific reward functions.\nRecently, there is also a growing adoption of large multi-modal foundation\nmodels for robotics that can perform visual reasoning in physical contexts and\ngenerate coarse robot motions for manipulation tasks. Motivated by this range\nof capability, in this work, we present Keypoint-based Affordance Guidance for\nImprovements (KAGI), a method leveraging rewards shaped by vision-language\nmodels (VLMs) for autonomous RL. State-of-the-art VLMs have demonstrated\nimpressive reasoning about affordances through keypoints in zero-shot, and we\nuse these to define dense rewards that guide autonomous robotic learning. 
On\nreal-world manipulation tasks specified by natural language descriptions, KAGI\nimproves the sample efficiency of autonomous RL and enables successful task\ncompletion in 30K online fine-tuning steps. Additionally, we demonstrate the\nrobustness of KAGI to reductions in the number of in-domain demonstrations used\nfor pre-training, reaching similar performance in 45K online fine-tuning steps.\nProject website: https://sites.google.com/view/affordance-guided-rl\n","authors":["Olivia Y. Lee","Annie Xie","Kuan Fang","Karl Pertsch","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2407.10341v5.pdf","comment":"8 pages, 6 figures. Robotics: Science and Systems (RSS) 2024, Task\n Specification for General-Purpose Intelligent Robots & Lifelong Robot\n Learning Workshops"},{"id":"http://arxiv.org/abs/2304.04172v2","updated":"2025-03-05T06:51:11Z","published":"2023-04-09T06:18:34Z","title":"$μ^2$-SGD: Stable Stochastic Optimization via a Double Momentum\n Mechanism","summary":" We consider stochastic convex optimization problems where the objective is an\nexpectation over smooth functions. For this setting we suggest a novel gradient\nestimate that combines two recent mechanism that are related to notion of\nmomentum. Then, we design an SGD-style algorithm as well as an accelerated\nversion that make use of this new estimator, and demonstrate the robustness of\nthese new approaches to the choice of the learning rate. Concretely, we show\nthat these approaches obtain the optimal convergence rates for both noiseless\nand noisy case with the same choice of fixed learning rate. Moreover, for the\nnoisy case we show that these approaches achieve the same optimal bound for a\nvery wide range of learning rates.\n","authors":["Tehila Dahan","Kfir Y. 
Levy"],"pdf_url":"https://arxiv.org/pdf/2304.04172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.19908v2","updated":"2025-03-05T06:36:27Z","published":"2025-02-27T09:26:22Z","title":"CarPlanner: Consistent Auto-regressive Trajectory Planning for\n Large-scale Reinforcement Learning in Autonomous Driving","summary":" Trajectory planning is vital for autonomous driving, ensuring safe and\nefficient navigation in complex environments. While recent learning-based\nmethods, particularly reinforcement learning (RL), have shown promise in\nspecific scenarios, RL planners struggle with training inefficiencies and\nmanaging large-scale, real-world driving scenarios. In this paper, we introduce\n\\textbf{CarPlanner}, a \\textbf{C}onsistent \\textbf{a}uto-\\textbf{r}egressive\n\\textbf{Planner} that uses RL to generate multi-modal trajectories. The\nauto-regressive structure enables efficient large-scale RL training, while the\nincorporation of consistency ensures stable policy learning by maintaining\ncoherent temporal consistency across time steps. Moreover, CarPlanner employs a\ngeneration-selection framework with an expert-guided reward function and an\ninvariant-view module, simplifying RL training and enhancing policy\nperformance. Extensive analysis demonstrates that our proposed RL framework\neffectively addresses the challenges of training efficiency and performance\nenhancement, positioning CarPlanner as a promising solution for trajectory\nplanning in autonomous driving. To the best of our knowledge, we are the first\nto demonstrate that the RL-based planner can surpass both IL- and rule-based\nstate-of-the-arts (SOTAs) on the challenging large-scale real-world dataset\nnuPlan. 
Our proposed CarPlanner surpasses RL-, IL-, and rule-based SOTA\napproaches within this demanding dataset.\n","authors":["Dongkun Zhang","Jiaming Liang","Ke Guo","Sha Lu","Qi Wang","Rong Xiong","Zhenwei Miao","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2502.19908v2.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2409.14494v3","updated":"2025-03-05T06:32:04Z","published":"2024-09-13T19:14:18Z","title":"CPT-Boosted Wav2vec2.0: Towards Noise Robust Speech Recognition for\n Classroom Environments","summary":" Creating Automatic Speech Recognition (ASR) systems that are robust and\nresilient to classroom conditions is paramount to the development of AI tools\nto aid teachers and students. In this work, we study the efficacy of continued\npretraining (CPT) in adapting Wav2vec2.0 to the classroom domain. We show that\nCPT is a powerful tool in that regard and reduces the Word Error Rate (WER) of\nWav2vec2.0-based models by upwards of 10%. More specifically, CPT improves the\nmodel's robustness to different noises, microphones and classroom conditions.\n","authors":["Ahmed Adel Attia","Dorottya Demszky","Tolulope Ogunremi","Jing Liu","Carol Espy-Wilson"],"pdf_url":"https://arxiv.org/pdf/2409.14494v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.13018"},{"id":"http://arxiv.org/abs/2503.03213v1","updated":"2025-03-05T06:11:24Z","published":"2025-03-05T06:11:24Z","title":"Convergence Rates for Softmax Gating Mixture of Experts","summary":" Mixture of experts (MoE) has recently emerged as an effective framework to\nadvance the efficiency and scalability of machine learning models by softly\ndividing complex tasks among multiple specialized sub-models termed experts.\nCentral to the success of MoE is an adaptive softmax gating mechanism which\ntakes responsibility for determining the relevance of each expert to a given\ninput and then dynamically assigning experts their respective weights. 
Despite\nits widespread use in practice, a comprehensive study on the effects of the\nsoftmax gating on the MoE has been lacking in the literature. To bridge this\ngap in this paper, we perform a convergence analysis of parameter estimation\nand expert estimation under the MoE equipped with the standard softmax gating\nor its variants, including a dense-to-sparse gating and a hierarchical softmax\ngating, respectively. Furthermore, our theories also provide useful insights\ninto the design of sample-efficient expert structures. In particular, we\ndemonstrate that it requires polynomially many data points to estimate experts\nsatisfying our proposed \\emph{strong identifiability} condition, namely a\ncommonly used two-layer feed-forward network. In stark contrast, estimating\nlinear experts, which violate the strong identifiability condition,\nnecessitates exponentially many data points as a result of intrinsic parameter\ninteractions expressed in the language of partial differential equations. All\nthe theoretical results are substantiated with a rigorous guarantee.\n","authors":["Huy Nguyen","Nhat Ho","Alessandro Rinaldo"],"pdf_url":"https://arxiv.org/pdf/2503.03213v1.pdf","comment":"Section 2 of this work comes from our previous paper titled \"On Least\n Square Estimation in Softmax Gating Mixture of Experts\" and published at the\n ICML 2024"},{"id":"http://arxiv.org/abs/2503.03211v1","updated":"2025-03-05T06:06:16Z","published":"2025-03-05T06:06:16Z","title":"NodeReg: Mitigating the Imbalance and Distribution Shift Effects in\n Semi-Supervised Node Classification via Norm Consistency","summary":" Aggregating information from neighboring nodes benefits graph neural networks\n(GNNs) in semi-supervised node classification tasks. 
Nevertheless, this\nmechanism also renders nodes susceptible to the influence of their neighbors.\nFor instance, this will occur when the neighboring nodes are imbalanced or the\nneighboring nodes contain noise, which can even affect the GNN's ability to\ngeneralize out of distribution. We find that ensuring the consistency of the\nnorm for node representations can significantly reduce the impact of these two\nissues on GNNs. To this end, we propose a regularized optimization method\ncalled NodeReg that enforces the consistency of node representation norms. This\nmethod is simple but effective and satisfies Lipschitz continuity, thus\nfacilitating stable optimization and significantly improving semi-supervised\nnode classification performance under the above two scenarios. To illustrate,\nin the imbalance scenario, when training a GCN with an imbalance ratio of 0.1,\nNodeReg outperforms the most competitive baselines by 1.4%-25.9% in F1 score\nacross five public datasets. Similarly, in the distribution shift scenario,\nNodeReg outperforms the most competitive baseline by 1.4%-3.1% in accuracy.\n","authors":["Shenzhi Yang","Jun Xia","Jingbo Zhou","Xingkai Yao","Xiaofang Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.03211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02445v2","updated":"2025-03-05T06:04:37Z","published":"2025-03-04T09:40:00Z","title":"BRIDGE: Bootstrapping Text to Control Time-Series Generation via\n Multi-Agent Iterative Optimization and Diffusion Modelling","summary":" Time-series Generation (TSG) is a prominent research area with broad\napplications in simulations, data augmentation, and counterfactual analysis.\nWhile existing methods have shown promise in unconditional single-domain TSG,\nreal-world applications demand for cross-domain approaches capable of\ncontrolled generation tailored to domain-specific constraints and\ninstance-level requirements. 
In this paper, we argue that text can provide\nsemantic insights, domain information and instance-specific temporal patterns,\nto guide and improve TSG. We introduce ``Text-Controlled TSG'', a task focused\non generating realistic time series by incorporating textual descriptions. To\naddress data scarcity in this setting, we propose a novel LLM-based Multi-Agent\nframework that synthesizes diverse, realistic text-to-TS datasets. Furthermore,\nwe introduce BRIDGE, a hybrid text-controlled TSG framework that integrates\nsemantic prototypes with text description for supporting domain-level guidance.\nThis approach achieves state-of-the-art generation fidelity on 11 of 12\ndatasets, and improves controllability by 12.52% on MSE and 6.34% MAE compared\nto no text input generation, highlighting its potential for generating tailored\ntime-series data.\n","authors":["Hao Li","Yu-Hao Huang","Chang Xu","Viktor Schlegel","Ren-He Jiang","Riza Batista-Navarro","Goran Nenadic","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2503.02445v2.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2409.15866v3","updated":"2025-03-05T05:55:45Z","published":"2024-09-24T08:40:04Z","title":"Online Planning for Multi-UAV Pursuit-Evasion in Unknown Environments\n Using Deep Reinforcement Learning","summary":" Multi-UAV pursuit-evasion, where pursuers aim to capture evaders, poses a key\nchallenge for UAV swarm intelligence. Multi-agent reinforcement learning (MARL)\nhas demonstrated potential in modeling cooperative behaviors, but most RL-based\napproaches remain constrained to simplified simulations with limited dynamics\nor fixed scenarios. Previous attempts to deploy RL policy to real-world\npursuit-evasion are largely restricted to two-dimensional scenarios, such as\nground vehicles or UAVs at fixed altitudes. In this paper, we address multi-UAV\npursuit-evasion by considering UAV dynamics and physical constraints. 
We\nintroduce an evader prediction-enhanced network to tackle partial observability\nin cooperative strategy learning. Additionally, we propose an adaptive\nenvironment generator within MARL training, enabling higher exploration\nefficiency and better policy generalization across diverse scenarios.\nSimulations show our method significantly outperforms all baselines in\nchallenging scenarios, generalizing to unseen scenarios with a 100% capture\nrate. Finally, we derive a feasible policy via a two-stage reward refinement\nand deploy the policy on real quadrotors in a zero-shot manner. To our\nknowledge, this is the first work to derive and deploy an RL-based policy using\ncollective thrust and body rates control commands for multi-UAV pursuit-evasion\nin unknown environments. The open-source code and videos are available at\nhttps://sites.google.com/view/pursuit-evasion-rl.\n","authors":["Jiayu Chen","Chao Yu","Guosheng Li","Wenhao Tang","Shilong Ji","Xinyi Yang","Botian Xu","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2409.15866v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03206v1","updated":"2025-03-05T05:50:38Z","published":"2025-03-05T05:50:38Z","title":"An Analytical Theory of Power Law Spectral Bias in the Learning Dynamics\n of Diffusion Models","summary":" We developed an analytical framework for understanding how the learned\ndistribution evolves during diffusion model training. Leveraging the Gaussian\nequivalence principle, we derived exact solutions for the gradient-flow\ndynamics of weights in one- or two-layer linear denoiser settings with\narbitrary data. Remarkably, these solutions allowed us to derive the generated\ndistribution in closed form and its KL divergence through training. These\nanalytical results expose a pronounced power-law spectral bias, i.e., for\nweights and distributions, the convergence time of a mode follows an inverse\npower law of its variance. 
Empirical experiments on both Gaussian and image\ndatasets demonstrate that the power-law spectral bias remains robust even when\nusing deeper or convolutional architectures. Our results underscore the\nimportance of the data covariance in dictating the order and rate at which\ndiffusion models learn different modes of the data, providing potential\nexplanations for why earlier stopping could lead to incorrect details in image\ngenerative models.\n","authors":["Binxu Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03206v1.pdf","comment":"50 pages, 10 figures. Preprint"},{"id":"http://arxiv.org/abs/2411.16746v3","updated":"2025-03-05T05:34:47Z","published":"2024-11-23T20:41:24Z","title":"LoBAM: LoRA-Based Backdoor Attack on Model Merging","summary":" Model merging is an emerging technique that integrates multiple models\nfine-tuned on different tasks to create a versatile model that excels in\nmultiple domains. This scheme, in the meantime, may open up backdoor attack\nopportunities where one single malicious model can jeopardize the integrity of\nthe merged model. Existing works try to demonstrate the risk of such attacks by\nassuming substantial computational resources, focusing on cases where the\nattacker can fully fine-tune the pre-trained model. Such an assumption,\nhowever, may not be feasible given the increasing size of machine learning\nmodels. In practice where resources are limited and the attacker can only\nemploy techniques like Low-Rank Adaptation (LoRA) to produce the malicious\nmodel, it remains unclear whether the attack can still work and pose threats.\nIn this work, we first identify that the attack efficacy is significantly\ndiminished when using LoRA for fine-tuning. Then, we propose LoBAM, a method\nthat yields high attack success rate with minimal training resources. The key\nidea of LoBAM is to amplify the malicious weights in an intelligent way that\neffectively enhances the attack efficacy. 
We demonstrate that our design can\nlead to improved attack success rate through extensive empirical experiments\nacross various model merging scenarios. Moreover, we show that our method is\nhighly stealthy and is difficult to detect and defend against.\n","authors":["Ming Yin","Jingyang Zhang","Jingwei Sun","Minghong Fang","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2411.16746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03197v1","updated":"2025-03-05T05:30:26Z","published":"2025-03-05T05:30:26Z","title":"Directly Follows Graphs Go Predictive Process Monitoring With Graph\n Neural Networks","summary":" In the past years, predictive process monitoring (PPM) techniques based on\nartificial neural networks have evolved as a method to monitor the future\nbehavior of business processes. Existing approaches mostly focus on\ninterpreting the processes as sequences, so-called traces, and feeding them to\nneural architectures designed to operate on sequential data such as recurrent\nneural networks (RNNs) or transformers. In this study, we investigate an\nalternative way to perform PPM: by transforming each process in its\ndirectly-follows-graph (DFG) representation we are able to apply graph neural\nnetworks (GNNs) for the prediction tasks. By this, we aim to develop models\nthat are more suitable for complex processes that are long and contain an\nabundance of loops. In particular, we present different ways to create DFG\nrepresentations depending on the particular GNN we use. The tested GNNs range\nfrom classical node-based to novel edge-based architectures. Further, we\ninvestigate the possibility of using multi-graphs. 
By these steps, we aim to\ndesign graph representations that minimize the information loss when\ntransforming traces into graphs.\n","authors":["Attila Lischka","Simon Rauch","Oliver Stritzel"],"pdf_url":"https://arxiv.org/pdf/2503.03197v1.pdf","comment":"10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2503.03195v1","updated":"2025-03-05T05:25:54Z","published":"2025-03-05T05:25:54Z","title":"Online Bidding under RoS Constraints without Knowing the Value","summary":" We consider the problem of bidding in online advertising, where an advertiser\naims to maximize value while adhering to budget and Return-on-Spend (RoS)\nconstraints. Unlike prior work that assumes knowledge of the value generated by\nwinning each impression ({e.g.,} conversions), we address the more realistic\nsetting where the advertiser must simultaneously learn the optimal bidding\nstrategy and the value of each impression opportunity. This introduces a\nchallenging exploration-exploitation dilemma: the advertiser must balance\nexploring different bids to estimate impression values with exploiting current\nknowledge to bid effectively. To address this, we propose a novel Upper\nConfidence Bound (UCB)-style algorithm that carefully manages this trade-off.\nVia a rigorous theoretical analysis, we prove that our algorithm achieves\n$\\widetilde{O}(\\sqrt{T\\log(|\\mathcal{B}|T)})$ regret and constraint violation,\nwhere $T$ is the number of bidding rounds and $\\mathcal{B}$ is the domain of\npossible bids. This establishes the first optimal regret and constraint\nviolation bounds for bidding in the online setting with unknown impression\nvalues. Moreover, our algorithm is computationally efficient and simple to\nimplement. 
We validate our theoretical findings through experiments on\nsynthetic data, demonstrating that our algorithm exhibits strong empirical\nperformance compared to existing approaches.\n","authors":["Sushant Vijayan","Zhe Feng","Swati Padmanabhan","Karthikeyan Shanmugam","Arun Suggala","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2503.03195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01478v3","updated":"2025-03-05T05:24:54Z","published":"2025-03-03T12:37:34Z","title":"SePer: Measure Retrieval Utility Through The Lens Of Semantic Perplexity\n Reduction","summary":" Large Language Models (LLMs) have demonstrated improved generation\nperformance by incorporating externally retrieved knowledge, a process known as\nretrieval-augmented generation (RAG). Despite the potential of this approach,\nexisting studies evaluate RAG effectiveness by 1) assessing retrieval and\ngeneration components jointly, which obscures retrieval's distinct\ncontribution, or 2) examining retrievers using traditional metrics such as\nNDCG, which creates a gap in understanding retrieval's true utility in the\noverall generation process. To address the above limitations, in this work, we\nintroduce an automatic evaluation method that measures retrieval quality\nthrough the lens of information gain within the RAG framework. Specifically, we\npropose Semantic Perplexity (SePer), a metric that captures the LLM's internal\nbelief about the correctness of the retrieved information. We quantify the\nutility of retrieval by the extent to which it reduces semantic perplexity\npost-retrieval. 
Extensive experiments demonstrate that SePer not only aligns\nclosely with human preferences but also offers a more precise and efficient\nevaluation of retrieval utility across diverse RAG scenarios.\n","authors":["Lu Dai","Yijie Xu","Jinhui Ye","Hao Liu","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.01478v3.pdf","comment":"ICLR 2025 Spotlight"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.08642v2","updated":"2025-03-05T15:55:52Z","published":"2024-10-11T09:10:26Z","title":"More than Memes: A Multimodal Topic Modeling Approach to Conspiracy\n Theories on Telegram","summary":" To address the increasing prevalence of (audio-)visual data on social media,\nand to capture the evolving and dynamic nature of this communication,\nresearchers have begun to explore the potential of unsupervised approaches for\nanalyzing multimodal online content. However, existing research often neglects\nvisual content beyond memes, and in addition lacks methods to compare topic\nmodels across modalities. Our study addresses these gaps by applying multimodal\ntopic modeling for analyzing conspiracy theories in German-language Telegram\nchannels. We use BERTopic with CLIP for the analysis of textual and visual data\nin a corpus of ~40, 000 Telegram messages posted in October 2023 in 571\nGerman-language Telegram channels known for disseminating conspiracy theories.\nThrough this dataset, we provide insights into unimodal and multimodal topic\nmodels by analyzing symmetry and intersections of topics across modalities. We\ndemonstrate the variety of textual and visual content shared in the channels\ndiscovered through the topic modeling, and propose a conceptual framework for\nthe analysis of textual and visual discursive strategies in the communication\nof conspiracy theories. 
We apply the framework in a case study of the topic\ngroup Israel Gaza.\n","authors":["Elisabeth Steffen"],"pdf_url":"https://arxiv.org/pdf/2410.08642v2.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.07369v3","updated":"2025-03-05T00:06:53Z","published":"2024-10-09T18:33:06Z","title":"An Undetectable Watermark for Generative Image Models","summary":" We present the first undetectable watermarking scheme for generative image\nmodels. Undetectability ensures that no efficient adversary can distinguish\nbetween watermarked and un-watermarked images, even after making many adaptive\nqueries. In particular, an undetectable watermark does not degrade image\nquality under any efficiently computable metric. Our scheme works by selecting\nthe initial latents of a diffusion model using a pseudorandom error-correcting\ncode (Christ and Gunn, 2024), a strategy which guarantees undetectability and\nrobustness. We experimentally demonstrate that our watermarks are\nquality-preserving and robust using Stable Diffusion 2.1. Our experiments\nverify that, in contrast to every prior scheme we tested, our watermark does\nnot degrade image quality. Our experiments also demonstrate robustness:\nexisting watermark removal attacks fail to remove our watermark from images\nwithout significantly degrading the quality of the images. Finally, we find\nthat we can robustly encode 512 bits in our watermark, and up to 2500 bits when\nthe images are not subjected to watermark removal attacks. 
Our code is\navailable at https://github.com/XuandongZhao/PRC-Watermark.\n","authors":["Sam Gunn","Xuandong Zhao","Dawn Song"],"pdf_url":"https://arxiv.org/pdf/2410.07369v3.pdf","comment":"ICLR 2025"}]},"2025-03-06T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2503.04720v1","updated":"2025-03-06T18:59:06Z","published":"2025-03-06T18:59:06Z","title":"FluidNexus: 3D Fluid Reconstruction and Prediction from a Single Video","summary":" We study reconstructing and predicting 3D fluid appearance and velocity from\na single video. Current methods require multi-view videos for fluid\nreconstruction. We present FluidNexus, a novel framework that bridges video\ngeneration and physics simulation to tackle this task. Our key insight is to\nsynthesize multiple novel-view videos as references for reconstruction.\nFluidNexus consists of two key components: (1) a novel-view video synthesizer\nthat combines frame-wise view synthesis with video diffusion refinement for\ngenerating realistic videos, and (2) a physics-integrated particle\nrepresentation coupling differentiable simulation and rendering to\nsimultaneously facilitate 3D fluid reconstruction and prediction. To evaluate\nour approach, we collect two new real-world fluid datasets featuring textured\nbackgrounds and object interactions. Our method enables dynamic novel view\nsynthesis, future prediction, and interaction simulation from a single fluid\nvideo. Project website: https://yuegao.me/FluidNexus.\n","authors":["Yue Gao","Hong-Xing Yu","Bo Zhu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2503.04720v1.pdf","comment":"CVPR 2025. 
Project website: https://yuegao.me/FluidNexus"},{"id":"http://arxiv.org/abs/2503.04718v1","updated":"2025-03-06T18:58:45Z","published":"2025-03-06T18:58:45Z","title":"Floxels: Fast Unsupervised Voxel Based Scene Flow Estimation","summary":" Scene flow estimation is a foundational task for many robotic applications,\nincluding robust dynamic object detection, automatic labeling, and sensor\nsynchronization. Two types of approaches to the problem have evolved: 1)\nSupervised and 2) optimization-based methods. Supervised methods are fast\nduring inference and achieve high-quality results, however, they are limited by\nthe need for large amounts of labeled training data and are susceptible to\ndomain gaps. In contrast, unsupervised test-time optimization methods do not\nface the problem of domain gaps but usually suffer from substantial runtime,\nexhibit artifacts, or fail to converge to the right solution. In this work, we\nmitigate several limitations of existing optimization-based methods. To this\nend, we 1) introduce a simple voxel grid-based model that improves over the\nstandard MLP-based formulation in multiple dimensions and 2) introduce a new\nmultiframe loss formulation. 3) We combine both contributions in our new\nmethod, termed Floxels. On the Argoverse 2 benchmark, Floxels is surpassed only\nby EulerFlow among unsupervised methods while achieving comparable performance\nat a fraction of the computational cost. Floxels achieves a massive speedup of\nmore than ~60 - 140x over EulerFlow, reducing the runtime from a day to 10\nminutes per sequence. Over the faster but low-quality baseline, NSFP, Floxels\nachieves a speedup of ~14x.\n","authors":["David T. 
Hoffmann","Syed Haseeb Raza","Hanqiu Jiang","Denis Tananaev","Steffen Klingenhoefer","Martin Meinke"],"pdf_url":"https://arxiv.org/pdf/2503.04718v1.pdf","comment":"Accepted at CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04707v1","updated":"2025-03-06T18:55:21Z","published":"2025-03-06T18:55:21Z","title":"Iris Style Transfer: Enhancing Iris Recognition with Style Features and\n Privacy Preservation through Neural Style Transfer","summary":" Iris texture is widely regarded as a gold standard biometric modality for\nauthentication and identification. The demand for robust iris recognition\nmethods, coupled with growing security and privacy concerns regarding iris\nattacks, has escalated recently. Inspired by neural style transfer, an advanced\ntechnique that leverages neural networks to separate content and style\nfeatures, we hypothesize that iris texture's style features provide a reliable\nfoundation for recognition and are more resilient to variations like rotation\nand perspective shifts than traditional approaches. Our experimental results\nsupport this hypothesis, showing a significantly higher classification accuracy\ncompared to conventional features. Further, we propose using neural style\ntransfer to mask identifiable iris style features, ensuring the protection of\nsensitive biometric information while maintaining the utility of eye images for\ntasks like eye segmentation and gaze estimation. 
This work opens new avenues\nfor iris-oriented, secure, and privacy-aware biometric systems.\n","authors":["Mengdi Wang","Efe Bozkir","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2503.04707v1.pdf","comment":"14 pages main paper, 4 pages appendix"},{"id":"http://arxiv.org/abs/2503.04698v1","updated":"2025-03-06T18:46:10Z","published":"2025-03-06T18:46:10Z","title":"DEAL-YOLO: Drone-based Efficient Animal Localization using YOLO","summary":" Although advances in deep learning and aerial surveillance technology are\nimproving wildlife conservation efforts, complex and erratic environmental\nconditions still pose a problem, requiring innovative solutions for\ncost-effective small animal detection. This work introduces DEAL-YOLO, a novel\napproach that improves small object detection in Unmanned Aerial Vehicle (UAV)\nimages by using multi-objective loss functions like Wise IoU (WIoU) and\nNormalized Wasserstein Distance (NWD), which prioritize pixels near the centre\nof the bounding box, ensuring smoother localization and reducing abrupt\ndeviations. Additionally, the model is optimized through efficient feature\nextraction with Linear Deformable (LD) convolutions, enhancing accuracy while\nmaintaining computational efficiency. The Scaled Sequence Feature Fusion (SSFF)\nmodule enhances object detection by effectively capturing inter-scale\nrelationships, improving feature representation, and boosting metrics through\noptimized multiscale fusion. Comparison with baseline models reveals high\nefficacy with up to 69.5\\% fewer parameters compared to vanilla Yolov8-N,\nhighlighting the robustness of the proposed modifications. Through this\napproach, our paper aims to facilitate the detection of endangered species,\nanimal population analysis, habitat monitoring, biodiversity research, and\nvarious other applications that enrich wildlife conservation efforts. 
DEAL-YOLO\nemploys a two-stage inference paradigm for object detection, refining selected\nregions to improve localization and confidence. This approach enhances\nperformance, especially for small instances with low objectness scores.\n","authors":["Aditya Prashant Naidu","Hem Gosalia","Ishaan Gakhar","Shaurya Singh Rathore","Krish Didwania","Ujjwal Verma"],"pdf_url":"https://arxiv.org/pdf/2503.04698v1.pdf","comment":"Accepted as a Poster at the ML4RS Workshop at ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04688v1","updated":"2025-03-06T18:31:41Z","published":"2025-03-06T18:31:41Z","title":"Teach YOLO to Remember: A Self-Distillation Approach for Continual\n Object Detection","summary":" Real-time object detectors like YOLO achieve exceptional performance when\ntrained on large datasets for multiple epochs. However, in real-world scenarios\nwhere data arrives incrementally, neural networks suffer from catastrophic\nforgetting, leading to a loss of previously learned knowledge. To address this,\nprior research has explored strategies for Class Incremental Learning (CIL) in\nContinual Learning for Object Detection (CLOD), with most approaches focusing\non two-stage object detectors. However, existing work suggests that Learning\nwithout Forgetting (LwF) may be ineffective for one-stage anchor-free detectors\nlike YOLO due to noisy regression outputs, which risk transferring corrupted\nknowledge. In this work, we introduce YOLO LwF, a self-distillation approach\ntailored for YOLO-based continual object detection. 
We demonstrate that when\ncoupled with a replay memory, YOLO LwF significantly mitigates forgetting.\nCompared to previous approaches, it achieves state-of-the-art performance,\nimproving mAP by +2.1% and +2.9% on the VOC and COCO benchmarks, respectively.\n","authors":["Riccardo De Monte","Davide Dalle Pezze","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2503.04688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.12360v2","updated":"2025-03-06T18:07:00Z","published":"2025-02-17T22:50:45Z","title":"Detecting Systematic Weaknesses in Vision Models along Predefined\n Human-Understandable Dimensions","summary":" Slice discovery methods (SDMs) are prominent algorithms for finding\nsystematic weaknesses in DNNs. They identify top-k semantically coherent\nslices/subsets of data where a DNN-under-test has low performance. For being\ndirectly useful, slices should be aligned with human-understandable and\nrelevant dimensions, which, for example, are defined by safety and domain\nexperts as part of the operational design domain (ODD). While SDMs can be\napplied effectively on structured data, their application on image data is\ncomplicated by the lack of semantic metadata. To address these issues, we\npresent an algorithm that combines foundation models for zero-shot image\nclassification to generate semantic metadata with methods for combinatorial\nsearch to find systematic weaknesses in images. In contrast to existing\napproaches, ours identifies weak slices that are in line with pre-defined\nhuman-understandable dimensions. As the algorithm includes foundation models,\nits intermediate and final results may not always be exact. Therefore, we\ninclude an approach to address the impact of noisy metadata. We validate our\nalgorithm on both synthetic and real-world datasets, demonstrating its ability\nto recover human-understandable systematic weaknesses. 
Furthermore, using our\napproach, we identify systematic weaknesses of multiple pre-trained and\npublicly available state-of-the-art computer vision DNNs.\n","authors":["Sujan Sai Gannamaneni","Rohil Prakash Rao","Michael Mock","Maram Akila","Stefan Wrobel"],"pdf_url":"https://arxiv.org/pdf/2502.12360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04666v1","updated":"2025-03-06T17:59:29Z","published":"2025-03-06T17:59:29Z","title":"What Are You Doing? A Closer Look at Controllable Human Video Generation","summary":" High-quality benchmarks are crucial for driving progress in machine learning\nresearch. However, despite the growing interest in video generation, there is\nno comprehensive dataset to evaluate human generation. Humans can perform a\nwide variety of actions and interactions, but existing datasets, like TikTok\nand TED-Talks, lack the diversity and complexity to fully capture the\ncapabilities of video generation models. We close this gap by introducing `What\nAre You Doing?' (WYD): a new benchmark for fine-grained evaluation of\ncontrollable image-to-video generation of humans. WYD consists of 1{,}544\ncaptioned videos that have been meticulously collected and annotated with 56\nfine-grained categories. These allow us to systematically measure performance\nacross 9 aspects of human generation, including actions, interactions and\nmotion. We also propose and validate automatic metrics that leverage our\nannotations and better capture human evaluations. Equipped with our dataset and\nmetrics, we perform in-depth analyses of seven state-of-the-art models in\ncontrollable image-to-video generation, showing how WYD provides novel insights\nabout the capabilities of these models. 
We release our data and code to drive\nforward progress in human video generation modeling at\nhttps://github.com/google-deepmind/wyd-benchmark.\n","authors":["Emanuele Bugliarello","Anurag Arnab","Roni Paiss","Pieter-Jan Kindermans","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2503.04666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04665v1","updated":"2025-03-06T17:58:55Z","published":"2025-03-06T17:58:55Z","title":"Implicit Neural Representation for Video and Image Super-Resolution","summary":" We present a novel approach for super-resolution that utilizes implicit\nneural representation (INR) to effectively reconstruct and enhance\nlow-resolution videos and images. By leveraging the capacity of neural networks\nto implicitly encode spatial and temporal features, our method facilitates\nhigh-resolution reconstruction using only low-resolution inputs and a 3D\nhigh-resolution grid. This results in an efficient solution for both image and\nvideo super-resolution. Our proposed method, SR-INR, maintains consistent\ndetails across frames and images, achieving impressive temporal stability\nwithout relying on the computationally intensive optical flow or motion\nestimation typically used in other video super-resolution techniques. The\nsimplicity of our approach contrasts with the complexity of many existing\nmethods, making it both effective and efficient. Experimental evaluations show\nthat SR-INR delivers results on par with or superior to state-of-the-art\nsuper-resolution methods, while maintaining a more straightforward structure\nand reduced computational demands. 
These findings highlight the potential of\nimplicit neural representations as a powerful tool for reconstructing\nhigh-quality, temporally consistent video and image signals from low-resolution\ndata.\n","authors":["Mary Aiyetigbo","Wanqi Yuan","Feng Luo","Nianyi Li"],"pdf_url":"https://arxiv.org/pdf/2503.04665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.09696v2","updated":"2025-03-06T17:45:33Z","published":"2025-02-13T18:59:11Z","title":"ZeroBench: An Impossible Visual Benchmark for Contemporary Large\n Multimodal Models","summary":" Large Multimodal Models (LMMs) exhibit major shortfalls when interpreting\nimages and, by some measures, have poorer spatial cognition than small children\nor animals. Despite this, they attain high scores on many popular visual\nbenchmarks, with headroom rapidly eroded by an ongoing surge of model progress.\nTo address this, there is a pressing need for difficult benchmarks that remain\nrelevant for longer. We take this idea to its limit by introducing ZeroBench-a\nlightweight visual reasoning benchmark that is entirely impossible for\ncontemporary frontier LMMs. Our benchmark consists of 100 manually curated\nquestions and 334 less difficult subquestions. We evaluate 20 LMMs on\nZeroBench, all of which score 0.0%, and rigorously analyse the errors. To\nencourage progress in visual understanding, we publicly release ZeroBench.\n","authors":["Jonathan Roberts","Mohammad Reza Taesiri","Ansh Sharma","Akash Gupta","Samuel Roberts","Ioana Croitoru","Simion-Vlad Bogolin","Jialu Tang","Florian Langer","Vyas Raina","Vatsal Raina","Hanyi Xiong","Vishaal Udandarao","Jingyi Lu","Shiyang Chen","Sam Purkis","Tianshuo Yan","Wenye Lin","Gyungin Shin","Qiaochu Yang","Anh Totti Nguyen","David I. Atkinson","Aaditya Baranwal","Alexandru Coca","Mikah Dang","Sebastian Dziadzio","Jakob D. 
Kunz","Kaiqu Liang","Alexander Lo","Brian Pulfer","Steven Walton","Charig Yang","Kai Han","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2502.09696v2.pdf","comment":"20 pages, 13 figures"},{"id":"http://arxiv.org/abs/2503.04653v1","updated":"2025-03-06T17:43:03Z","published":"2025-03-06T17:43:03Z","title":"RadIR: A Scalable Framework for Multi-Grained Medical Image Retrieval\n via Radiology Report Mining","summary":" Developing advanced medical imaging retrieval systems is challenging due to\nthe varying definitions of `similar images' across different medical contexts.\nThis challenge is compounded by the lack of large-scale, high-quality medical\nimaging retrieval datasets and benchmarks. In this paper, we propose a novel\nmethodology that leverages dense radiology reports to define image-wise\nsimilarity ordering at multiple granularities in a scalable and fully automatic\nmanner. Using this approach, we construct two comprehensive medical imaging\nretrieval datasets: MIMIC-IR for Chest X-rays and CTRATE-IR for CT scans,\nproviding detailed image-image ranking annotations conditioned on diverse\nanatomical structures. Furthermore, we develop two retrieval systems, RadIR-CXR\nand model-ChestCT, which demonstrate superior performance in traditional\nimage-image and image-report retrieval tasks. 
These systems also enable\nflexible, effective image retrieval conditioned on specific anatomical\nstructures described in text, achieving state-of-the-art results on 77 out of\n78 metrics.\n","authors":["Tengfei Zhang","Ziheng Zhao","Chaoyi Wu","Xiao Zhou","Ya Zhang","Yangfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2503.04653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04649v1","updated":"2025-03-06T17:35:37Z","published":"2025-03-06T17:35:37Z","title":"Transferable Foundation Models for Geometric Tasks on Point Cloud\n Representations: Geometric Neural Operators","summary":" We introduce methods for obtaining pretrained Geometric Neural Operators\n(GNPs) that can serve as basal foundation models for use in obtaining geometric\nfeatures. These can be used within data processing pipelines for machine\nlearning tasks and numerical methods. We show how our GNPs can be trained to\nlearn robust latent representations for the differential geometry of\npoint-clouds to provide estimates of metric, curvature, and other shape-related\nfeatures. We demonstrate how our pre-trained GNPs can be used (i) to estimate\nthe geometric properties of surfaces of arbitrary shape and topologies with\nrobustness in the presence of noise, (ii) to approximate solutions of geometric\npartial differential equations (PDEs) on manifolds, and (iii) to solve\nequations for shape deformations such as curvature driven flows. We also\nrelease a package of the codes and weights for using our pre-trained GNPs for\nprocessing point cloud representations. This allows for incorporating our\npre-trained GNPs as components for reuse within existing and new data\nprocessing pipelines. The GNPs also can be used as part of numerical solvers\ninvolving geometry or as part of methods for performing inference and other\ngeometric tasks.\n","authors":["Blaine Quackenbush","Paul J. 
Atzberger"],"pdf_url":"https://arxiv.org/pdf/2503.04649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04873v2","updated":"2025-03-06T17:35:19Z","published":"2025-01-08T23:07:10Z","title":"Back Home: A Machine Learning Approach to Seashell Classification and\n Ecosystem Restoration","summary":" In Costa Rica, an average of 5 tons of seashells are extracted from\necosystems annually. Confiscated seashells, cannot be returned to their\necosystems due to the lack of origin recognition. To address this issue, we\ndeveloped a convolutional neural network (CNN) specifically for seashell\nidentification. We built a dataset from scratch, consisting of approximately\n19000 images from the Pacific and Caribbean coasts. Using this dataset, the\nmodel achieved a classification accuracy exceeding 85%. The model has been\nintegrated into a user-friendly application, which has classified over 36,000\nseashells to date, delivering real-time results within 3 seconds per image. To\nfurther enhance the system's accuracy, an anomaly detection mechanism was\nincorporated to filter out irrelevant or anomalous inputs, ensuring only valid\nseashell images are processed.\n","authors":["Alexander Valverde","Luis Solano"],"pdf_url":"https://arxiv.org/pdf/2501.04873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04643v1","updated":"2025-03-06T17:32:15Z","published":"2025-03-06T17:32:15Z","title":"Adaptive Prototype Learning for Multimodal Cancer Survival Analysis","summary":" Leveraging multimodal data, particularly the integration of whole-slide\nhistology images (WSIs) and transcriptomic profiles, holds great promise for\nimproving cancer survival prediction. However, excessive redundancy in\nmultimodal data can degrade model performance. In this paper, we propose\nAdaptive Prototype Learning (APL), a novel and effective approach for\nmultimodal cancer survival analysis. 
APL adaptively learns representative\nprototypes in a data-driven manner, reducing redundancy while preserving\ncritical information. Our method employs two sets of learnable query vectors\nthat serve as a bridge between high-dimensional representations and survival\nprediction, capturing task-relevant features. Additionally, we introduce a\nmultimodal mixed self-attention mechanism to enable cross-modal interactions,\nfurther enhancing information fusion. Extensive experiments on five benchmark\ncancer datasets demonstrate the superiority of our approach over existing\nmethods. The code is available at https://github.com/HongLiuuuuu/APL.\n","authors":["Hong Liu","Haosen Yang","Federica Eduati","Josien P. W. Pluim","Mitko Veta"],"pdf_url":"https://arxiv.org/pdf/2503.04643v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2503.04641v1","updated":"2025-03-06T17:31:43Z","published":"2025-03-06T17:31:43Z","title":"Simulating the Real World: A Unified Survey of Multimodal Generative\n Models","summary":" Understanding and replicating the real world is a critical challenge in\nArtificial General Intelligence (AGI) research. To achieve this, many existing\napproaches, such as world models, aim to capture the fundamental principles\ngoverning the physical world, enabling more accurate simulations and meaningful\ninteractions. However, current methods often treat different modalities,\nincluding 2D (images), videos, 3D, and 4D representations, as independent\ndomains, overlooking their interdependencies. Additionally, these methods\ntypically focus on isolated dimensions of reality without systematically\nintegrating their connections. In this survey, we present a unified survey for\nmultimodal generative models that investigate the progression of data\ndimensionality in real-world simulation. 
Specifically, this survey starts from\n2D generation (appearance), then moves to video (appearance+dynamics) and 3D\ngeneration (appearance+geometry), and finally culminates in 4D generation that\nintegrate all dimensions. To the best of our knowledge, this is the first\nattempt to systematically unify the study of 2D, video, 3D and 4D generation\nwithin a single framework. To guide future research, we provide a comprehensive\nreview of datasets, evaluation metrics and future directions, and fostering\ninsights for newcomers. This survey serves as a bridge to advance the study of\nmultimodal generative models and real-world simulation within a unified\nframework.\n","authors":["Yuqi Hu","Longguang Wang","Xian Liu","Ling-Hao Chen","Yuwei Guo","Yukai Shi","Ce Liu","Anyi Rao","Zeyu Wang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.04641v1.pdf","comment":"Repository for the related papers at\n https://github.com/ALEEEHU/World-Simulator"},{"id":"http://arxiv.org/abs/2503.04639v1","updated":"2025-03-06T17:28:48Z","published":"2025-03-06T17:28:48Z","title":"Enhancing SAM with Efficient Prompting and Preference Optimization for\n Semi-supervised Medical Image Segmentation","summary":" Foundational models such as the Segment Anything Model (SAM) are gaining\ntraction in medical imaging segmentation, supporting multiple downstream tasks.\nHowever, such models are supervised in nature, still relying on large annotated\ndatasets or prompts supplied by experts. Conventional techniques such as active\nlearning to alleviate such limitations are limited in scope and still\nnecessitate continuous human involvement and complex domain knowledge for label\nrefinement or establishing reward ground truth. 
To address these challenges, we\npropose an enhanced Segment Anything Model (SAM) framework that utilizes\nannotation-efficient prompts generated in a fully unsupervised fashion, while\nstill capturing essential semantic, location, and shape information through\ncontrastive language-image pretraining and visual question answering. We adopt\nthe direct preference optimization technique to design an optimal policy that\nenables the model to generate high-fidelity segmentations with simple ratings\nor rankings provided by a virtual annotator simulating the human annotation\nprocess. State-of-the-art performance of our framework in tasks such as lung\nsegmentation, breast tumor segmentation, and organ segmentation across various\nmodalities, including X-ray, ultrasound, and abdominal CT, justifies its\neffectiveness in low-annotation data scenarios.\n","authors":["Aishik Konwer","Zhijian Yang","Erhan Bas","Cao Xiao","Prateek Prasanna","Parminder Bhatia","Taha Kass-Hout"],"pdf_url":"https://arxiv.org/pdf/2503.04639v1.pdf","comment":"Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04635v1","updated":"2025-03-06T17:23:55Z","published":"2025-03-06T17:23:55Z","title":"3HANDS Dataset: Learning from Humans for Generating Naturalistic\n Handovers with Supernumerary Robotic Limbs","summary":" Supernumerary robotic limbs (SRLs) are robotic structures integrated closely\nwith the user's body, which augment human physical capabilities and necessitate\nseamless, naturalistic human-machine interaction. For effective assistance in\nphysical tasks, enabling SRLs to hand over objects to humans is crucial. Yet,\ndesigning heuristic-based policies for robots is time-consuming, difficult to\ngeneralize across tasks, and results in less human-like motion. When trained\nwith proper datasets, generative models are powerful alternatives for creating\nnaturalistic handover motions. 
We introduce 3HANDS, a novel dataset of object\nhandover interactions between a participant performing a daily activity and\nanother participant enacting a hip-mounted SRL in a naturalistic manner. 3HANDS\ncaptures the unique characteristics of SRL interactions: operating in intimate\npersonal space with asymmetric object origins, implicit motion synchronization,\nand the user's engagement in a primary task during the handover. To demonstrate\nthe effectiveness of our dataset, we present three models: one that generates\nnaturalistic handover trajectories, another that determines the appropriate\nhandover endpoints, and a third that predicts the moment to initiate a\nhandover. In a user study (N=10), we compare the handover interaction performed\nwith our method compared to a baseline. The findings show that our method was\nperceived as significantly more natural, less physically demanding, and more\ncomfortable.\n","authors":["Artin Saberpour Abadian","Yi-Chi Liao","Ata Otaran","Rishabh Dabral","Marie Muehlhaus","Christian Theobalt","Martin Schmitz","Jürgen Steimle"],"pdf_url":"https://arxiv.org/pdf/2503.04635v1.pdf","comment":"CHI '25"},{"id":"http://arxiv.org/abs/2503.04634v1","updated":"2025-03-06T17:21:12Z","published":"2025-03-06T17:21:12Z","title":"PathoPainter: Augmenting Histopathology Segmentation via Tumor-aware\n Inpainting","summary":" Tumor segmentation plays a critical role in histopathology, but it requires\ncostly, fine-grained image-mask pairs annotated by pathologists. Thus,\nsynthesizing histopathology data to expand the dataset is highly desirable.\nPrevious works suffer from inaccuracies and limited diversity in image-mask\npairs, both of which affect training segmentation, particularly in small-scale\ndatasets and the inherently complex nature of histopathology images. To address\nthis challenge, we propose PathoPainter, which reformulates image-mask pair\ngeneration as a tumor inpainting task. 
Specifically, our approach preserves the\nbackground while inpainting the tumor region, ensuring precise alignment\nbetween the generated image and its corresponding mask. To enhance dataset\ndiversity while maintaining biological plausibility, we incorporate a sampling\nmechanism that conditions tumor inpainting on regional embeddings from a\ndifferent image. Additionally, we introduce a filtering strategy to exclude\nuncertain synthetic regions, further improving the quality of the generated\ndata. Our comprehensive evaluation spans multiple datasets featuring diverse\ntumor types and various training data scales. As a result, segmentation\nimproved significantly with our synthetic data, surpassing existing\nsegmentation data synthesis approaches, e.g., 75.69% -> 77.69% on CAMELYON16.\nThe code is available at https://github.com/HongLiuuuuu/PathoPainter.\n","authors":["Hong Liu","Haosen Yang","Evi M. C. Huijben","Mark Schuiveling","Ruisheng Su","Josien P. W. Pluim","Mitko Veta"],"pdf_url":"https://arxiv.org/pdf/2503.04634v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2503.00897v3","updated":"2025-03-06T17:19:22Z","published":"2025-03-02T13:43:53Z","title":"A Simple and Effective Reinforcement Learning Method for Text-to-Image\n Diffusion Fine-tuning","summary":" Reinforcement learning (RL)-based fine-tuning has emerged as a powerful\napproach for aligning diffusion models with black-box objectives. Proximal\npolicy optimization (PPO) is the most popular choice of method for policy\noptimization. While effective in terms of performance, PPO is highly sensitive\nto hyper-parameters and involves substantial computational overhead. REINFORCE,\non the other hand, mitigates some computational complexities such as high\nmemory overhead and sensitive hyper-parameter tuning, but has suboptimal\nperformance due to high-variance and sample inefficiency. 
While the variance of\nthe REINFORCE can be reduced by sampling multiple actions per input prompt and\nusing a baseline correction term, it still suffers from sample inefficiency. To\naddress these challenges, we systematically analyze the\nefficiency-effectiveness trade-off between REINFORCE and PPO, and propose\nleave-one-out PPO (LOOP), a novel RL for diffusion fine-tuning method. LOOP\ncombines variance reduction techniques from REINFORCE, such as sampling\nmultiple actions per input prompt and a baseline correction term, with the\nrobustness and sample efficiency of PPO via clipping and importance sampling.\nOur results demonstrate that LOOP effectively improves diffusion models on\nvarious black-box objectives, and achieves a better balance between\ncomputational efficiency and performance.\n","authors":["Shashank Gupta","Chaitanya Ahuja","Tsung-Yu Lin","Sreya Dutta Roy","Harrie Oosterhuis","Maarten de Rijke","Satya Narayan Shukla"],"pdf_url":"https://arxiv.org/pdf/2503.00897v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12833v2","updated":"2025-03-06T17:18:49Z","published":"2024-05-21T14:37:35Z","title":"A Survey of Deep Learning-based Radiology Report Generation Using\n Multimodal Data","summary":" Automatic radiology report generation can alleviate the workload for\nphysicians and minimize regional disparities in medical resources, therefore\nbecoming an important topic in the medical image analysis field. It is a\nchallenging task, as the computational model needs to mimic physicians to\nobtain information from multi-modal input data (i.e., medical images, clinical\ninformation, medical knowledge, etc.), and produce comprehensive and accurate\nreports. Recently, numerous works have emerged to address this issue using\ndeep-learning-based methods, such as transformers, contrastive learning, and\nknowledge-base construction. 
This survey summarizes the key techniques\ndeveloped in the most recent works and proposes a general workflow for\ndeep-learning-based report generation with five main components, including\nmulti-modality data acquisition, data preparation, feature learning, feature\nfusion and interaction, and report generation. The state-of-the-art methods for\neach of these components are highlighted. Additionally, we summarize the latest\ndevelopments in large model-based methods and model explainability, along with\npublic datasets, evaluation methods, current challenges, and future directions\nin this field. We have also conducted a quantitative comparison between\ndifferent methods in the same experimental setting. This is the most up-to-date\nsurvey that focuses on multi-modality inputs and data fusion for radiology\nreport generation. The aim is to provide comprehensive and rich information for\nresearchers interested in automatic clinical report generation and medical\nimage analysis, especially when using multimodal inputs, and to assist them in\ndeveloping new algorithms to advance the field.\n","authors":["Xinyi Wang","Grazziela Figueredo","Ruizhe Li","Wei Emma Zhang","Weitong Chen","Xin Chen"],"pdf_url":"https://arxiv.org/pdf/2405.12833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11919v3","updated":"2025-03-06T17:12:48Z","published":"2024-09-18T12:32:25Z","title":"LLM-wrapper: Black-Box Semantic-Aware Adaptation of Vision-Language\n Models for Referring Expression Comprehension","summary":" Vision Language Models (VLMs) have demonstrated remarkable capabilities in\nvarious open-vocabulary tasks, yet their zero-shot performance lags behind\ntask-specific fine-tuned models, particularly in complex tasks like Referring\nExpression Comprehension (REC). Fine-tuning usually requires 'white-box' access\nto the model's architecture and weights, which is not always feasible due to\nproprietary or privacy concerns. 
In this work, we propose LLM-wrapper, a method\nfor 'black-box' adaptation of VLMs for the REC task using Large Language Models\n(LLMs). LLM-wrapper capitalizes on the reasoning abilities of LLMs, improved\nwith a light fine-tuning, to select the most relevant bounding box matching the\nreferring expression, from candidates generated by a zero-shot black-box VLM.\nOur approach offers several advantages: it enables the adaptation of\nclosed-source models without needing access to their internal workings, it is\nversatile as it works with any VLM, it transfers to new VLMs and datasets, and\nit allows for the adaptation of an ensemble of VLMs. We evaluate LLM-wrapper on\nmultiple datasets using different VLMs and LLMs, demonstrating significant\nperformance improvements and highlighting the versatility of our method. While\nLLM-wrapper is not meant to directly compete with standard white-box\nfine-tuning, it offers a practical and effective alternative for black-box VLM\nadaptation. Code and checkpoints are available at\nhttps://github.com/valeoai/LLM_wrapper .\n","authors":["Amaia Cardiel","Eloi Zablocki","Elias Ramzi","Oriane Siméoni","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2409.11919v3.pdf","comment":"LLM-wrapper (v3) is published as a conference paper at ICLR 2025. (v1\n was presented at EVAL-FoMo workshop, ECCV 2024.)"},{"id":"http://arxiv.org/abs/2410.05116v2","updated":"2025-03-06T17:11:55Z","published":"2024-10-07T15:12:01Z","title":"Human-Feedback Efficient Reinforcement Learning for Online Diffusion\n Model Finetuning","summary":" Controllable generation through Stable Diffusion (SD) fine-tuning aims to\nimprove fidelity, safety, and alignment with human guidance. Existing\nreinforcement learning from human feedback methods usually rely on predefined\nheuristic reward functions or pretrained reward models built on large-scale\ndatasets, limiting their applicability to scenarios where collecting such data\nis costly or difficult. 
To effectively and efficiently utilize human feedback,\nwe develop a framework, HERO, which leverages online human feedback collected\non the fly during model learning. Specifically, HERO features two key\nmechanisms: (1) Feedback-Aligned Representation Learning, an online training\nmethod that captures human feedback and provides informative learning signals\nfor fine-tuning, and (2) Feedback-Guided Image Generation, which involves\ngenerating images from SD's refined initialization samples, enabling faster\nconvergence towards the evaluator's intent. We demonstrate that HERO is 4x more\nefficient in online feedback for body part anomaly correction compared to the\nbest existing method. Additionally, experiments show that HERO can effectively\nhandle tasks like reasoning, counting, personalization, and reducing NSFW\ncontent with only 0.5K online feedback.\n","authors":["Ayano Hiranaka","Shang-Fu Chen","Chieh-Hsin Lai","Dongjun Kim","Naoki Murata","Takashi Shibuya","Wei-Hsiang Liao","Shao-Hua Sun","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2410.05116v2.pdf","comment":"Published in International Conference on Learning Representations\n (ICLR) 2025"},{"id":"http://arxiv.org/abs/2503.02394v3","updated":"2025-03-06T17:10:24Z","published":"2025-03-04T08:35:01Z","title":"BHViT: Binarized Hybrid Vision Transformer","summary":" Model binarization has made significant progress in enabling real-time and\nenergy-efficient computation for convolutional neural networks (CNN), offering\na potential solution to the deployment challenges faced by Vision Transformers\n(ViTs) on edge devices. However, due to the structural differences between CNN\nand Transformer architectures, simply applying binary CNN strategies to the ViT\nmodels will lead to a significant performance drop. 
To tackle this challenge,\nwe propose BHViT, a binarization-friendly hybrid ViT architecture and its full\nbinarization model with the guidance of three important observations.\nInitially, BHViT utilizes the local information interaction and hierarchical\nfeature aggregation technique from coarse to fine levels to address redundant\ncomputations stemming from excessive tokens. Then, a novel module based on\nshift operations is proposed to enhance the performance of the binary\nMultilayer Perceptron (MLP) module without significantly increasing\ncomputational overhead. In addition, an innovative attention matrix\nbinarization method based on quantization decomposition is proposed to evaluate\nthe token's importance in the binarized attention matrix. Finally, we propose a\nregularization loss to address the inadequate optimization caused by the\nincompatibility between the weight oscillation in the binary layers and the\nAdam Optimizer. Extensive experimental results demonstrate that our proposed\nalgorithm achieves SOTA performance among binary ViT methods.\n","authors":["Tian Gao","Zhiyuan Zhang","Yu Zhang","Huajun Liu","Kaijie Yin","Chengzhong Xu","Hui Kong"],"pdf_url":"https://arxiv.org/pdf/2503.02394v3.pdf","comment":"Accepted by CVPR2025"},{"id":"http://arxiv.org/abs/2407.18125v3","updated":"2025-03-06T17:03:35Z","published":"2024-07-25T15:32:59Z","title":"Self-supervised pre-training with diffusion model for few-shot landmark\n detection in x-ray images","summary":" Deep neural networks have been extensively applied in the medical domain for\nvarious tasks, including image classification, segmentation, and landmark\ndetection. However, their application is often hindered by data scarcity, both\nin terms of available annotations and images. This study introduces a novel\napplication of denoising diffusion probabilistic models (DDPMs) to the landmark\ndetection task, specifically addressing the challenge of limited annotated data\nin x-ray imaging. 
Our key innovation lies in leveraging DDPMs for\nself-supervised pre-training in landmark detection, a previously unexplored\napproach in this domain. This method enables accurate landmark detection with\nminimal annotated training data (as few as 50 images), surpassing both ImageNet\nsupervised pre-training and traditional self-supervised techniques across three\npopular x-ray benchmark datasets. To our knowledge, this work represents the\nfirst application of diffusion models for self-supervised learning in landmark\ndetection, which may offer a valuable pre-training approach in few-shot\nregimes, for mitigating data scarcity.\n","authors":["Roberto Di Via","Francesca Odone","Vito Paolo Pastore"],"pdf_url":"https://arxiv.org/pdf/2407.18125v3.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2503.04606v1","updated":"2025-03-06T16:53:14Z","published":"2025-03-06T16:53:14Z","title":"The Best of Both Worlds: Integrating Language Models and Diffusion\n Models for Video Generation","summary":" Recent advancements in text-to-video (T2V) generation have been driven by two\ncompeting paradigms: autoregressive language models and diffusion models.\nHowever, each paradigm has intrinsic limitations: language models struggle with\nvisual quality and error accumulation, while diffusion models lack semantic\nunderstanding and causal modeling. In this work, we propose LanDiff, a hybrid\nframework that synergizes the strengths of both paradigms through\ncoarse-to-fine generation. 
Our architecture introduces three key innovations:\n(1) a semantic tokenizer that compresses 3D visual features into compact 1D\ndiscrete representations through efficient semantic compression, achieving a\n$\\sim$14,000$\\times$ compression ratio; (2) a language model that generates\nsemantic tokens with high-level semantic relationships; (3) a streaming\ndiffusion model that refines coarse semantics into high-fidelity videos.\nExperiments show that LanDiff, a 5B model, achieves a score of 85.43 on the\nVBench T2V benchmark, surpassing the state-of-the-art open-source models\nHunyuan Video (13B) and other commercial models such as Sora, Keling, and\nHailuo. Furthermore, our model also achieves state-of-the-art performance in\nlong video generation, surpassing other open-source models in this field. Our\ndemo can be viewed at https://landiff.github.io/.\n","authors":["Aoxiong Yin","Kai Shen","Yichong Leng","Xu Tan","Xinyu Zhou","Juncheng Li","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2503.04606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17494v4","updated":"2025-03-06T16:43:10Z","published":"2024-10-23T01:25:25Z","title":"Enhancing Multimodal Medical Image Classification using Cross-Graph\n Modal Contrastive Learning","summary":" The classification of medical images is a pivotal aspect of disease\ndiagnosis, often enhanced by deep learning techniques. However, traditional\napproaches typically focus on unimodal medical image data, neglecting the\nintegration of diverse non-image patient data. This paper proposes a novel\nCross-Graph Modal Contrastive Learning (CGMCL) framework for multimodal\nstructured data from different data domains to improve medical image\nclassification. The model effectively integrates both image and non-image data\nby constructing cross-modality graphs and leveraging contrastive learning to\nalign multimodal features in a shared latent space. 
An inter-modality feature\nscaling module further optimizes the representation learning process by\nreducing the gap between heterogeneous modalities. The proposed approach is\nevaluated on two datasets: a Parkinson's disease (PD) dataset and a public\nmelanoma dataset. Results demonstrate that CGMCL outperforms conventional\nunimodal methods in accuracy, interpretability, and early disease prediction.\nAdditionally, the method shows superior performance in multi-class melanoma\nclassification. The CGMCL framework provides valuable insights into medical\nimage classification while offering improved disease interpretability and\npredictive capabilities.\n","authors":["Jun-En Ding","Chien-Chin Hsu","Chi-Hsiang Chu","Shuqiang Wang","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.17494v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04592v1","updated":"2025-03-06T16:31:34Z","published":"2025-03-06T16:31:34Z","title":"A Benchmark for Multi-Lingual Vision-Language Learning in Remote Sensing\n Image Captioning","summary":" Remote Sensing Image Captioning (RSIC) is a cross-modal field bridging vision\nand language, aimed at automatically generating natural language descriptions\nof features and scenes in remote sensing imagery. Despite significant advances\nin developing sophisticated methods and large-scale datasets for training\nvision-language models (VLMs), two critical challenges persist: the scarcity of\nnon-English descriptive datasets and the lack of multilingual capability\nevaluation for models. These limitations fundamentally impede the progress and\npractical deployment of RSIC, particularly in the era of large VLMs. To address\nthese challenges, this paper presents several significant contributions to the\nfield. 
First, we introduce and analyze BRSIC (Bilingual Remote Sensing Image\nCaptioning), a comprehensive bilingual dataset that enriches three established\nEnglish RSIC datasets with Chinese descriptions, encompassing 13,634 images\npaired with 68,170 bilingual captions. Building upon this foundation, we\ndevelop a systematic evaluation framework that addresses the prevalent\ninconsistency in evaluation protocols, enabling rigorous assessment of model\nperformance through standardized retraining procedures on BRSIC. Furthermore,\nwe present an extensive empirical study of eight state-of-the-art large\nvision-language models (LVLMs), examining their capabilities across multiple\nparadigms including zero-shot inference, supervised fine-tuning, and\nmulti-lingual training. This comprehensive evaluation provides crucial insights\ninto the strengths and limitations of current LVLMs in handling multilingual\nremote sensing tasks. Additionally, our cross-dataset transfer experiments\nreveal interesting findings. The code and data will be available at\nhttps://github.com/mrazhou/BRSIC.\n","authors":["Qing Zhou","Tao Yang","Junyu Gao","Weiping Ni","Junzheng Wu","Qi Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03663v2","updated":"2025-03-06T16:25:37Z","published":"2025-03-05T16:52:34Z","title":"LION-FS: Fast & Slow Video-Language Thinker as Online Video Assistant","summary":" First-person video assistants are highly anticipated to enhance our daily\nlives through online video dialogue. 
However, existing online video assistants\noften sacrifice assistant efficacy for real-time efficiency by processing\nlow-frame-rate videos with coarse-grained visual features.To overcome the\ntrade-off between efficacy and efficiency, we propose \"Fast & Slow\nVideo-Language Thinker\" as an onLIne videO assistaNt, LION-FS, achieving\nreal-time, proactive, temporally accurate, and contextually precise responses.\nLION-FS adopts a two-stage optimization strategy: 1)Fast Path: Routing-Based\nResponse Determination evaluates frame-by-frame whether an immediate response\nis necessary. To enhance response determination accuracy and handle higher\nframe-rate inputs efficiently, we employ Token Aggregation Routing to\ndynamically fuse spatiotemporal features without increasing token numbers,\nwhile utilizing Token Dropping Routing to eliminate redundant features. 2)Slow\nPath: Multi-granularity Keyframe Augmentation optimizes keyframes during\nresponse generation. To provide comprehensive and detailed responses beyond\natomic actions constrained by training data, fine-grained spatial features and\nhuman-environment interaction features are extracted through multi-granular\npooling. 
These features are further integrated into a meticulously designed\nmultimodal Thinking Template to guide more precise response generation.\nComprehensive evaluations on online video tasks demonstrate that LION-FS\nachieves state-of-the-art efficacy and efficiency.\n","authors":["Wei Li","Bing Hu","Rui Shao","Leyang Shen","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2503.03663v2.pdf","comment":"Accept to CVPR 2025, Project page:\n https://github.com/JiuTian-VL/LION-FS"},{"id":"http://arxiv.org/abs/2503.04565v1","updated":"2025-03-06T15:53:42Z","published":"2025-03-06T15:53:42Z","title":"Omnidirectional Multi-Object Tracking","summary":" Panoramic imagery, with its 360{\\deg} field of view, offers comprehensive\ninformation to support Multi-Object Tracking (MOT) in capturing spatial and\ntemporal relationships of surrounding objects. However, most MOT algorithms are\ntailored for pinhole images with limited views, impairing their effectiveness\nin panoramic settings. Additionally, panoramic image distortions, such as\nresolution loss, geometric deformation, and uneven lighting, hinder direct\nadaptation of existing MOT methods, leading to significant performance\ndegradation. To address these challenges, we propose OmniTrack, an\nomnidirectional MOT framework that incorporates Tracklet Management to\nintroduce temporal cues, FlexiTrack Instances for object localization and\nassociation, and the CircularStatE Module to alleviate image and geometric\ndistortions. This integration enables tracking in large field-of-view\nscenarios, even under rapid sensor motion. To mitigate the lack of panoramic\nMOT datasets, we introduce the QuadTrack dataset--a comprehensive panoramic\ndataset collected by a quadruped robot, featuring diverse challenges such as\nwide fields of view, intense motion, and complex environments. 
Extensive\nexperiments on the public JRDB dataset and the newly introduced QuadTrack\nbenchmark demonstrate the state-of-the-art performance of the proposed\nframework. OmniTrack achieves a HOTA score of 26.92% on JRDB, representing an\nimprovement of 3.43%, and further achieves 23.45% on QuadTrack, surpassing the\nbaseline by 6.81%. The dataset and code will be made publicly available at\nhttps://github.com/xifen523/OmniTrack.\n","authors":["Kai Luo","Hao Shi","Sheng Wu","Fei Teng","Mengfei Duan","Chang Huang","Yuhang Wang","Kaiwei Wang","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2503.04565v1.pdf","comment":"Accepted to CVPR 2025. The dataset and code will be made publicly\n available at https://github.com/xifen523/OmniTrack"},{"id":"http://arxiv.org/abs/2502.09990v2","updated":"2025-03-06T15:38:31Z","published":"2025-02-14T08:22:51Z","title":"X-Boundary: Establishing Exact Safety Boundary to Shield LLMs from\n Multi-Turn Jailbreaks without Compromising Usability","summary":" Despite the rapid development of safety alignment techniques for LLMs,\ndefending against multi-turn jailbreaks is still a challenging task. In this\npaper, we conduct a comprehensive comparison, revealing that some existing\ndefense methods can improve the robustness of LLMs against multi-turn\njailbreaks but compromise usability, i.e., reducing general capabilities or\ncausing the over-refusal problem. From the perspective of mechanism\ninterpretability of LLMs, we discover that these methods fail to establish a\nboundary that exactly distinguishes safe and harmful feature representations.\nTherefore, boundary-safe representations close to harmful representations are\ninevitably disrupted, leading to a decline in usability. To address this issue,\nwe propose X-Boundary to push harmful representations away from boundary-safe\nrepresentations and obtain an exact distinction boundary. 
In this way, harmful\nrepresentations can be precisely erased without disrupting safe ones.\nExperimental results show that X-Boundary achieves state-of-the-art defense\nperformance against multi-turn jailbreaks, while reducing the over-refusal rate\nby about 20% and maintaining nearly complete general capability. Furthermore,\nwe theoretically prove and empirically verify that X-Boundary can accelerate\nthe convergence process during training. Please see our code at:\nhttps://github.com/AI45Lab/X-Boundary.\n","authors":["Xiaoya Lu","Dongrui Liu","Yi Yu","Luxin Xu","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2502.09990v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04545v1","updated":"2025-03-06T15:33:19Z","published":"2025-03-06T15:33:19Z","title":"ViT-VS: On the Applicability of Pretrained Vision Transformer Features\n for Generalizable Visual Servoing","summary":" Visual servoing enables robots to precisely position their end-effector\nrelative to a target object. While classical methods rely on hand-crafted\nfeatures and thus are universally applicable without task-specific training,\nthey often struggle with occlusions and environmental variations, whereas\nlearning-based approaches improve robustness but typically require extensive\ntraining. We present a visual servoing approach that leverages pretrained\nvision transformers for semantic feature extraction, combining the advantages\nof both paradigms while also being able to generalize beyond the provided\nsample. Our approach achieves full convergence in unperturbed scenarios and\nsurpasses classical image-based visual servoing by up to 31.2\\% relative\nimprovement in perturbed scenarios. Even the convergence rates of\nlearning-based methods are matched despite requiring no task- or\nobject-specific training. Real-world evaluations confirm robust performance in\nend-effector positioning, industrial box manipulation, and grasping of unseen\nobjects using only a reference from the same category. 
Our code and simulation\nenvironment are available at: https://alessandroscherl.github.io/ViT-VS/\n","authors":["Alessandro Scherl","Stefan Thalhammer","Bernhard Neuberger","Wilfried Wöber","José Gracía-Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2503.04545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00299v2","updated":"2025-03-06T15:32:33Z","published":"2024-10-01T00:43:45Z","title":"GSPR: Multimodal Place Recognition Using 3D Gaussian Splatting for\n Autonomous Driving","summary":" Place recognition is a crucial component that enables autonomous vehicles to\nobtain localization results in GPS-denied environments. In recent years,\nmultimodal place recognition methods have gained increasing attention. They\novercome the weaknesses of unimodal sensor systems by leveraging complementary\ninformation from different modalities. However, most existing methods explore\ncross-modality correlations through feature-level or descriptor-level fusion,\nsuffering from a lack of interpretability. Conversely, the recently proposed 3D\nGaussian Splatting provides a new perspective on multimodal fusion by\nharmonizing different modalities into an explicit scene representation. In this\npaper, we propose a 3D Gaussian Splatting-based multimodal place recognition\nnetwork dubbed GSPR. It explicitly combines multi-view RGB images and LiDAR\npoint clouds into a spatio-temporally unified scene representation with the\nproposed Multimodal Gaussian Splatting. A network composed of 3D graph\nconvolution and transformer is designed to extract spatio-temporal features and\nglobal descriptors from the Gaussian scenes for place recognition. Extensive\nevaluations on three datasets demonstrate that our method can effectively\nleverage complementary strengths of both multi-view cameras and LiDAR,\nachieving SOTA place recognition performance while maintaining solid\ngeneralization ability. 
Our open-source code will be released at\nhttps://github.com/QiZS-BIT/GSPR.\n","authors":["Zhangshuo Qi","Junyi Ma","Jingyi Xu","Zijie Zhou","Luqi Cheng","Guangming Xiong"],"pdf_url":"https://arxiv.org/pdf/2410.00299v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.07775v2","updated":"2025-03-06T15:15:58Z","published":"2024-12-10T18:59:58Z","title":"Efficient Diversity-Preserving Diffusion Alignment via Gradient-Informed\n GFlowNets","summary":" While one commonly trains large diffusion models by collecting datasets on\ntarget downstream tasks, it is often desired to align and finetune pretrained\ndiffusion models with some reward functions that are either designed by experts\nor learned from small-scale datasets. Existing post-training methods for reward\nfinetuning of diffusion models typically suffer from lack of diversity in\ngenerated samples, lack of prior preservation, and/or slow convergence in\nfinetuning. Inspired by recent successes in generative flow networks\n(GFlowNets), a class of probabilistic models that sample with the unnormalized\ndensity of a reward function, we propose a novel GFlowNet method dubbed\nNabla-GFlowNet (abbreviated as \\methodname), the first GFlowNet method that\nleverages the rich signal in reward gradients, together with an objective\ncalled \\graddb plus its variant \\resgraddb designed for prior-preserving\ndiffusion finetuning. We show that our proposed method achieves fast yet\ndiversity- and prior-preserving finetuning of Stable Diffusion, a large-scale\ntext-conditioned image diffusion model, on different realistic reward\nfunctions.\n","authors":["Zhen Liu","Tim Z. 
Xiao","Weiyang Liu","Yoshua Bengio","Dinghuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.07775v2.pdf","comment":"Technical Report (35 pages, 31 figures), Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04522v1","updated":"2025-03-06T15:08:34Z","published":"2025-03-06T15:08:34Z","title":"In-Context Reverse Classification Accuracy: Efficient Estimation of\n Segmentation Quality without Ground-Truth","summary":" Assessing the quality of automatic image segmentation is crucial in clinical\npractice, but often very challenging due to the limited availability of ground\ntruth annotations. In this paper, we introduce In-Context Reverse\nClassification Accuracy (In-Context RCA), a novel framework for automatically\nestimating segmentation quality in the absence of ground-truth annotations. By\nleveraging recent in-context learning segmentation models and incorporating\nretrieval-augmentation techniques to select the most relevant reference images,\nour approach enables efficient quality estimation with minimal reference data.\nValidated across diverse medical imaging modalities, our method demonstrates\nrobust performance and computational efficiency, offering a promising solution\nfor automated quality control in clinical workflows, where fast and reliable\nsegmentation assessment is essential. 
The code is available at\nhttps://github.com/mcosarinsky/In-Context-RCA.\n","authors":["Matias Cosarinsky","Ramiro Billot","Lucas Mansilla","Gabriel Gimenez","Nicolas Gaggión","Guanghui Fu","Enzo Ferrante"],"pdf_url":"https://arxiv.org/pdf/2503.04522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.05874v2","updated":"2025-03-06T15:02:33Z","published":"2025-02-09T12:23:40Z","title":"MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor\n Scene Generation","summary":" Controllable 3D scene generation has extensive applications in virtual\nreality and interior design, where the generated scenes should exhibit high\nlevels of realism and controllability in terms of geometry. Scene graphs\nprovide a suitable data representation that facilitates these applications.\nHowever, current graph-based methods for scene generation are constrained to\ntext-based inputs and exhibit insufficient adaptability to flexible user\ninputs, hindering the ability to precisely control object geometry. To address\nthis issue, we propose MMGDreamer, a dual-branch diffusion model for scene\ngeneration that incorporates a novel Mixed-Modality Graph, visual enhancement\nmodule, and relation predictor. The mixed-modality graph allows object nodes to\nintegrate textual and visual modalities, with optional relationships between\nnodes. It enhances adaptability to flexible user inputs and enables meticulous\ncontrol over the geometry of objects in the generated scenes. The visual\nenhancement module enriches the visual fidelity of text-only nodes by\nconstructing visual representations using text embeddings. Furthermore, our\nrelation predictor leverages node representations to infer absent relationships\nbetween nodes, resulting in more coherent scene layouts. Extensive experimental\nresults demonstrate that MMGDreamer exhibits superior control of object\ngeometry, achieving state-of-the-art scene generation performance. 
Project\npage: https://yangzhifeio.github.io/project/MMGDreamer.\n","authors":["Zhifei Yang","Keyang Lu","Chao Zhang","Jiaxing Qi","Hanqi Jiang","Ruifei Ma","Shenglin Yin","Yifan Xu","Mingzhe Xing","Zhen Xiao","Jieyi Long","Xiangde Liu","Guangyao Zhai"],"pdf_url":"https://arxiv.org/pdf/2502.05874v2.pdf","comment":"Accepted by AAAI 2025 Main Track"},{"id":"http://arxiv.org/abs/2503.04513v1","updated":"2025-03-06T14:59:38Z","published":"2025-03-06T14:59:38Z","title":"A Novel Solution for Drone Photogrammetry with Low-overlap Aerial Images\n using Monocular Depth Estimation","summary":" Low-overlap aerial imagery poses significant challenges to traditional\nphotogrammetric methods, which rely heavily on high image overlap to produce\naccurate and complete mapping products. In this study, we propose a novel\nworkflow based on monocular depth estimation to address the limitations of\nconventional techniques. Our method leverages tie points obtained from aerial\ntriangulation to establish a relationship between monocular depth and metric\ndepth, thus transforming the original depth map into a metric depth map,\nenabling the generation of dense depth information and the comprehensive\nreconstruction of the scene. For the experiments, a high-overlap drone dataset\ncontaining 296 images is processed using Metashape to generate depth maps and\nDSMs as ground truth. Subsequently, we create a low-overlap dataset by\nselecting 20 images for experimental evaluation. Results demonstrate that while\nthe recovered depth maps and resulting DSMs achieve meter-level accuracy, they\nprovide significantly better completeness compared to traditional methods,\nparticularly in regions covered by single images. 
This study showcases the\npotential of monocular depth estimation in low-overlap aerial photogrammetry.\n","authors":["Jiageng Zhong","Qi Zhou","Ming Li","Armin Gruen","Xuan Liao"],"pdf_url":"https://arxiv.org/pdf/2503.04513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04504v1","updated":"2025-03-06T14:52:34Z","published":"2025-03-06T14:52:34Z","title":"AnyAnomaly: Zero-Shot Customizable Video Anomaly Detection with LVLM","summary":" Video anomaly detection (VAD) is crucial for video analysis and surveillance\nin computer vision. However, existing VAD models rely on learned normal\npatterns, which makes them difficult to apply to diverse environments.\nConsequently, users should retrain models or develop separate AI models for new\nenvironments, which requires expertise in machine learning, high-performance\nhardware, and extensive data collection, limiting the practical usability of\nVAD. To address these challenges, this study proposes customizable video\nanomaly detection (C-VAD) technique and the AnyAnomaly model. C-VAD considers\nuser-defined text as an abnormal event and detects frames containing a\nspecified event in a video. We effectively implemented AnyAnomaly using a\ncontext-aware visual question answering without fine-tuning the large vision\nlanguage model. To validate the effectiveness of the proposed model, we\nconstructed C-VAD datasets and demonstrated the superiority of AnyAnomaly.\nFurthermore, our approach showed competitive performance on VAD benchmark\ndatasets, achieving state-of-the-art results on the UBnormal dataset and\noutperforming other methods in generalization across all datasets. 
Our code is\navailable online at github.com/SkiddieAhn/Paper-AnyAnomaly.\n","authors":["Sunghyun Ahn","Youngwan Jo","Kijung Lee","Sein Kwon","Inpyo Hong","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2503.04504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.01262v2","updated":"2025-03-06T14:50:58Z","published":"2025-02-03T11:36:01Z","title":"FSPGD: Rethinking Black-box Attacks on Semantic Segmentation","summary":" Transferability, the ability of adversarial examples crafted for one model to\ndeceive other models, is crucial for black-box attacks. Despite advancements in\nattack methods for semantic segmentation, transferability remains limited,\nreducing their effectiveness in real-world applications. To address this, we\nintroduce the Feature Similarity Projected Gradient Descent (FSPGD) attack, a\nnovel black-box approach that enhances both attack performance and\ntransferability. Unlike conventional segmentation attacks that rely on output\npredictions for gradient calculation, FSPGD computes gradients from\nintermediate layer features. Specifically, our method introduces a loss\nfunction that targets local information by comparing features between clean\nimages and adversarial examples, while also disrupting contextual information\nby accounting for spatial relationships between objects. Experiments on Pascal\nVOC 2012 and Cityscapes datasets demonstrate that FSPGD achieves superior\ntransferability and attack performance, establishing a new state-of-the-art\nbenchmark. 
Code is available at https://github.com/KU-AIVS/FSPGD.\n","authors":["Eun-Sol Park","MiSo Park","Seung Park","Yong-Goo Shin"],"pdf_url":"https://arxiv.org/pdf/2502.01262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04501v1","updated":"2025-03-06T14:50:17Z","published":"2025-03-06T14:50:17Z","title":"IMFine: 3D Inpainting via Geometry-guided Multi-view Refinement","summary":" Current 3D inpainting and object removal methods are largely limited to\nfront-facing scenes, facing substantial challenges when applied to diverse,\n\"unconstrained\" scenes where the camera orientation and trajectory are\nunrestricted. To bridge this gap, we introduce a novel approach that produces\ninpainted 3D scenes with consistent visual quality and coherent underlying\ngeometry across both front-facing and unconstrained scenes. Specifically, we\npropose a robust 3D inpainting pipeline that incorporates geometric priors and\na multi-view refinement network trained via test-time adaptation, building on a\npre-trained image inpainting model. Additionally, we develop a novel inpainting\nmask detection technique to derive targeted inpainting masks from object masks,\nboosting the performance in handling unconstrained scenes. To validate the\nefficacy of our approach, we create a challenging and diverse benchmark that\nspans a wide range of scenes. 
Comprehensive experiments demonstrate that our\nproposed method substantially outperforms existing state-of-the-art approaches.\n","authors":["Zhihao Shi","Dong Huo","Yuhongze Zhou","Kejia Yin","Yan Min","Juwei Lu","Xinxin Zuo"],"pdf_url":"https://arxiv.org/pdf/2503.04501v1.pdf","comment":"Accepted at CVPR 2025,\n \\href{https://xinxinzuo2353.github.io/imfine/}{Project Page}"},{"id":"http://arxiv.org/abs/2503.04500v1","updated":"2025-03-06T14:49:28Z","published":"2025-03-06T14:49:28Z","title":"ReynoldsFlow: Exquisite Flow Estimation via Reynolds Transport Theorem","summary":" Optical flow is a fundamental technique for motion estimation, widely applied\nin video stabilization, interpolation, and object tracking. Recent advancements\nin artificial intelligence (AI) have enabled deep learning models to leverage\noptical flow as an important feature for motion analysis. However, traditional\noptical flow methods rely on restrictive assumptions, such as brightness\nconstancy and slow motion constraints, limiting their effectiveness in complex\nscenes. Deep learning-based approaches require extensive training on large\ndomain-specific datasets, making them computationally demanding. Furthermore,\noptical flow is typically visualized in the HSV color space, which introduces\nnonlinear distortions when converted to RGB and is highly sensitive to noise,\ndegrading motion representation accuracy. These limitations inherently\nconstrain the performance of downstream models, potentially hindering object\ntracking and motion analysis tasks. To address these challenges, we propose\nReynolds flow, a novel training-free flow estimation inspired by the Reynolds\ntransport theorem, offering a principled approach to modeling complex motion\ndynamics. Beyond the conventional HSV-based visualization, denoted\nReynoldsFlow, we introduce an alternative representation, ReynoldsFlow+,\ndesigned to improve flow visualization. 
We evaluate ReynoldsFlow and\nReynoldsFlow+ across three video-based benchmarks: tiny object detection on\nUAVDB, infrared object detection on Anti-UAV, and pose estimation on GolfDB.\nExperimental results demonstrate that networks trained with ReynoldsFlow+\nachieve state-of-the-art (SOTA) performance, exhibiting improved robustness and\nefficiency across all tasks.\n","authors":["Yu-Hsi Chen","Chin-Tien Wu"],"pdf_url":"https://arxiv.org/pdf/2503.04500v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2503.04499v1","updated":"2025-03-06T14:48:25Z","published":"2025-03-06T14:48:25Z","title":"Spatial regularisation for improved accuracy and interpretability in\n keypoint-based registration","summary":" Unsupervised registration strategies bypass requirements in ground truth\ntransforms or segmentations by optimising similarity metrics between fixed and\nmoved volumes. Among these methods, a recent subclass of approaches based on\nunsupervised keypoint detection stand out as very promising for\ninterpretability. Specifically, these methods train a network to predict\nfeature maps for fixed and moving images, from which explainable centres of\nmass are computed to obtain point clouds, that are then aligned in closed-form.\nHowever, the features returned by the network often yield spatially diffuse\npatterns that are hard to interpret, thus undermining the purpose of\nkeypoint-based registration. Here, we propose a three-fold loss to regularise\nthe spatial distribution of the features. First, we use the KL divergence to\nmodel features as point spread functions that we interpret as probabilistic\nkeypoints. Then, we sharpen the spatial distributions of these features to\nincrease the precision of the detected landmarks. Finally, we introduce a new\nrepulsive loss across keypoints to encourage spatial diversity. 
Overall, our\nloss considerably improves the interpretability of the features, which now\ncorrespond to precise and anatomically meaningful landmarks. We demonstrate our\nthree-fold loss in foetal rigid motion tracking and brain MRI affine\nregistration tasks, where it not only outperforms state-of-the-art unsupervised\nstrategies, but also bridges the gap with state-of-the-art supervised methods.\nOur code is available at https://github.com/BenBillot/spatial_regularisation.\n","authors":["Benjamin Billot","Ramya Muthukrishnan","Esra Abaci-Turk","Ellen P. Grant","Nicholas Ayache","Hervé Delingette","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2503.04499v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2503.04496v1","updated":"2025-03-06T14:44:25Z","published":"2025-03-06T14:44:25Z","title":"Learning Object Placement Programs for Indoor Scene Synthesis with\n Iterative Self Training","summary":" Data driven and autoregressive indoor scene synthesis systems generate indoor\nscenes automatically by suggesting and then placing objects one at a time.\nEmpirical observations show that current systems tend to produce incomplete\nnext object location distributions. We introduce a system which addresses this\nproblem. We design a Domain Specific Language (DSL) that specifies functional\nconstraints. Programs from our language take as input a partial scene and\nobject to place. Upon execution they predict possible object placements. We\ndesign a generative model which writes these programs automatically. Available\n3D scene datasets do not contain programs to train on, so we build upon\nprevious work in unsupervised program induction to introduce a new program\nbootstrapping algorithm. In order to quantify our empirical observations we\nintroduce a new evaluation procedure which captures how well a system models\nper-object location distributions. 
We ask human annotators to label all the\npossible places an object can go in a scene and show that our system produces\nper-object location distributions more consistent with human annotators. Our\nsystem also generates indoor scenes of comparable quality to previous systems\nand while previous systems degrade in performance when training data is sparse,\nour system does not degrade to the same degree.\n","authors":["Adrian Chang","Kai Wang","Yuanbo Li","Manolis Savva","Angel X. Chang","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2503.04496v1.pdf","comment":"21 pages, 20 figures Subjects: Graphics (cs.GR), Computer Vision and\n Pattern Recognition (cs.CV), Machine Learning (cs.LG)"},{"id":"http://arxiv.org/abs/2412.04842v3","updated":"2025-03-06T14:40:15Z","published":"2024-12-06T08:27:53Z","title":"UniMLVG: Unified Framework for Multi-view Long Video Generation with\n Comprehensive Control Capabilities for Autonomous Driving","summary":" The creation of diverse and realistic driving scenarios has become essential\nto enhance perception and planning capabilities of the autonomous driving\nsystem. However, generating long-duration, surround-view consistent driving\nvideos remains a significant challenge. To address this, we present UniMLVG, a\nunified framework designed to generate extended street multi-perspective videos\nunder precise control. By integrating single- and multi-view driving videos\ninto the training data, our approach updates a DiT-based diffusion model\nequipped with cross-frame and cross-view modules across three stages with multi\ntraining objectives, substantially boosting the diversity and quality of\ngenerated visual content. Importantly, we propose an innovative explicit\nviewpoint modeling approach for multi-view video generation to effectively\nimprove motion transition consistency. 
Capable of handling various input\nreference formats (e.g., text, images, or video), our UniMLVG generates\nhigh-quality multi-view videos according to the corresponding condition\nconstraints such as 3D bounding boxes or frame-level text descriptions.\nCompared to the best models with similar capabilities, our framework achieves\nimprovements of 48.2% in FID and 35.2% in FVD.\n","authors":["Rui Chen","Zehuan Wu","Yichen Liu","Yuxin Guo","Jingcheng Ni","Haifeng Xia","Siyu Xia"],"pdf_url":"https://arxiv.org/pdf/2412.04842v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03222v2","updated":"2025-03-06T14:32:49Z","published":"2025-03-05T06:32:49Z","title":"Mocap-2-to-3: Lifting 2D Diffusion-Based Pretrained Models for 3D Motion\n Capture","summary":" Recovering absolute poses in the world coordinate system from monocular views\npresents significant challenges. Two primary issues arise in this context.\nFirstly, existing methods rely on 3D motion data for training, which requires\ncollection in limited environments. Acquiring such 3D labels for new actions in\na timely manner is impractical, severely restricting the model's generalization\ncapabilities. In contrast, 2D poses are far more accessible and easier to\nobtain. Secondly, estimating a person's absolute position in metric space from\na single viewpoint is inherently more complex. To address these challenges, we\nintroduce Mocap-2-to-3, a novel framework that decomposes intricate 3D motions\ninto 2D poses, leveraging 2D data to enhance 3D motion reconstruction in\ndiverse scenarios and accurately predict absolute positions in the world\ncoordinate system. We initially pretrain a single-view diffusion model with\nextensive 2D data, followed by fine-tuning a multi-view diffusion model for\nview consistency using publicly available 3D data. This strategy facilitates\nthe effective use of large-scale 2D data. 
Additionally, we propose an\ninnovative human motion representation that decouples local actions from global\nmovements and encodes geometric priors of the ground, ensuring the generative\nmodel learns accurate motion priors from 2D data. During inference, this allows\nfor the gradual recovery of global movements, resulting in more plausible\npositioning. We evaluate our model's performance on real-world datasets,\ndemonstrating superior accuracy in motion and absolute human positioning\ncompared to state-of-the-art methods, along with enhanced generalization and\nscalability. Our code will be made publicly available.\n","authors":["Zhumei Wang","Zechen Hu","Ruoxi Guo","Huaijin Pi","Ziyong Feng","Sida Peng","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2503.03222v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04478v1","updated":"2025-03-06T14:28:17Z","published":"2025-03-06T14:28:17Z","title":"Semantic Alignment of Unimodal Medical Text and Vision Representations","summary":" General-purpose AI models, particularly those designed for text and vision,\ndemonstrate impressive versatility across a wide range of deep-learning tasks.\nHowever, they often underperform in specialised domains like medical imaging,\nwhere domain-specific solutions or alternative knowledge transfer approaches\nare typically required. Recent studies have noted that general-purpose models\ncan exhibit similar latent spaces when processing semantically related data,\nalthough this alignment does not occur naturally. Building on this insight, it\nhas been shown that applying a simple transformation - at most affine -\nestimated from a subset of semantically corresponding samples, known as\nanchors, enables model stitching across diverse training paradigms,\narchitectures, and modalities. In this paper, we explore how semantic alignment\n- estimating transformations between anchors - can bridge general-purpose AI\nwith specialised medical knowledge. 
Using multiple public chest X-ray datasets,\nwe demonstrate that model stitching across model architectures allows general\nmodels to integrate domain-specific knowledge without additional training,\nleading to improved performance on medical tasks. Furthermore, we introduce a\nnovel zero-shot classification approach for unimodal vision encoders that\nleverages semantic alignment across modalities. Our results show that our\nmethod not only outperforms general multimodal models but also approaches the\nperformance levels of fully trained, medical-specific multimodal solutions\n","authors":["Maxime Di Folco","Emily Chan","Marta Hasny","Cosmin I. Bercea","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2503.04478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13524v4","updated":"2025-03-06T14:27:12Z","published":"2025-02-19T08:21:59Z","title":"MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D\n Medical Image Analysis","summary":" Efficient evaluation of three-dimensional (3D) medical images is crucial for\ndiagnostic and therapeutic practices in healthcare. Recent years have seen a\nsubstantial uptake in applying deep learning and computer vision to analyse and\ninterpret medical images. Traditional approaches, such as convolutional neural\nnetworks (CNNs) and vision transformers (ViTs), face significant computational\nchallenges, prompting the need for architectural advancements. Recent efforts\nhave led to the introduction of novel architectures like the ``Mamba'' model as\nalternative solutions to traditional CNNs or ViTs. The Mamba model excels in\nthe linear processing of one-dimensional data with low computational demands.\nHowever, Mamba's potential for 3D medical image analysis remains underexplored\nand could face significant computational challenges as the dimension increases.\nThis manuscript presents MobileViM, a streamlined architecture for efficient\nsegmentation of 3D medical images. 
In the MobileViM network, we invent a new\ndimension-independent mechanism and a dual-direction traversing approach to\nincorporate with a vision-Mamba-based framework. MobileViM also features a\ncross-scale bridging technique to improve efficiency and accuracy across\nvarious medical imaging modalities. With these enhancements, MobileViM achieves\nsegmentation speeds exceeding 90 frames per second (FPS) on a single graphics\nprocessing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster\nthan the state-of-the-art deep learning models for processing 3D images with\nthe same computational resources. In addition, experimental evaluations\ndemonstrate that MobileViM delivers superior performance, with Dice similarity\nscores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024,\nATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses\nexisting models.\n","authors":["Wei Dai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2502.13524v4.pdf","comment":"The corresponding author disagrees with the manuscript submitted to\n arXiv"},{"id":"http://arxiv.org/abs/2503.04475v1","updated":"2025-03-06T14:24:22Z","published":"2025-03-06T14:24:22Z","title":"ForestLPR: LiDAR Place Recognition in Forests Attentioning Multiple BEV\n Density Images","summary":" Place recognition is essential to maintain global consistency in large-scale\nlocalization systems. While research in urban environments has progressed\nsignificantly using LiDARs or cameras, applications in natural forest-like\nenvironments remain largely under-explored. Furthermore, forests present\nparticular challenges due to high self-similarity and substantial variations in\nvegetation growth over time. In this work, we propose a robust LiDAR-based\nplace recognition method for natural forests, ForestLPR. We hypothesize that a\nset of cross-sectional images of the forest's geometry at different heights\ncontains the information needed to recognize revisiting a place. 
The\ncross-sectional images are represented by \\ac{bev} density images of horizontal\nslices of the point cloud at different heights. Our approach utilizes a visual\ntransformer as the shared backbone to produce sets of local descriptors and\nintroduces a multi-BEV interaction module to attend to information at different\nheights adaptively. It is followed by an aggregation layer that produces a\nrotation-invariant place descriptor. We evaluated the efficacy of our method\nextensively on real-world data from public benchmarks as well as robotic\ndatasets and compared it against the state-of-the-art (SOTA) methods. The\nresults indicate that ForestLPR has consistently good performance on all\nevaluations and achieves an average increase of 7.38\\% and 9.11\\% on Recall@1\nover the closest competitor on intra-sequence loop closure detection and\ninter-sequence re-localization, respectively, validating our hypothesis\n","authors":["Yanqing Shen","Turcan Tuna","Marco Hutter","Cesar Cadena","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2503.04475v1.pdf","comment":"accepted by CVPR2025"},{"id":"http://arxiv.org/abs/2503.04470v1","updated":"2025-03-06T14:21:43Z","published":"2025-03-06T14:21:43Z","title":"Gate-Shift-Pose: Enhancing Action Recognition in Sports with Skeleton\n Information","summary":" This paper introduces Gate-Shift-Pose, an enhanced version of Gate-Shift-Fuse\nnetworks, designed for athlete fall classification in figure skating by\nintegrating skeleton pose data alongside RGB frames. We evaluate two fusion\nstrategies: early-fusion, which combines RGB frames with Gaussian heatmaps of\npose keypoints at the input stage, and late-fusion, which employs a\nmulti-stream architecture with attention mechanisms to combine RGB and pose\nfeatures. Experiments on the FR-FS dataset demonstrate that Gate-Shift-Pose\nsignificantly outperforms the RGB-only baseline, improving accuracy by up to\n40% with ResNet18 and 20% with ResNet50. 
Early-fusion achieves the highest\naccuracy (98.08%) with ResNet50, leveraging the model's capacity for effective\nmultimodal integration, while late-fusion is better suited for lighter\nbackbones like ResNet18. These results highlight the potential of multimodal\narchitectures for sports action recognition and the critical role of skeleton\npose information in capturing complex motion patterns.\n","authors":["Edoardo Bianchi","Oswald Lanz"],"pdf_url":"https://arxiv.org/pdf/2503.04470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04459v1","updated":"2025-03-06T14:11:46Z","published":"2025-03-06T14:11:46Z","title":"Question-Aware Gaussian Experts for Audio-Visual Question Answering","summary":" Audio-Visual Question Answering (AVQA) requires not only question-based\nmultimodal reasoning but also precise temporal grounding to capture subtle\ndynamics for accurate prediction. However, existing methods mainly use question\ninformation implicitly, limiting focus on question-specific details.\nFurthermore, most studies rely on uniform frame sampling, which can miss key\nquestion-relevant frames. Although recent Top-K frame selection methods aim to\naddress this, their discrete nature still overlooks fine-grained temporal\ndetails. This paper proposes \\textbf{QA-TIGER}, a novel framework that\nexplicitly incorporates question information and models continuous temporal\ndynamics. Our key idea is to use Gaussian-based modeling to adaptively focus on\nboth consecutive and non-consecutive frames based on the question, while\nexplicitly injecting question information and applying progressive refinement.\nWe leverage a Mixture of Experts (MoE) to flexibly implement multiple Gaussian\nmodels, activating temporal experts specifically tailored to the question.\nExtensive experiments on multiple AVQA benchmarks show that QA-TIGER\nconsistently achieves state-of-the-art performance. 
Code is available at\nhttps://github.com/AIM-SKKU/QA-TIGER\n","authors":["Hongyeob Kim","Inyoung Jung","Dayoon Suh","Youjia Zhang","Sangmin Lee","Sungeun Hong"],"pdf_url":"https://arxiv.org/pdf/2503.04459v1.pdf","comment":"CVPR 2025. Project page at https://aim-skku.github.io/QA-TIGER/"},{"id":"http://arxiv.org/abs/2503.04457v1","updated":"2025-03-06T14:11:00Z","published":"2025-03-06T14:11:00Z","title":"TPC: Cross-Temporal Prediction Connection for Vision-Language Model\n Hallucination Reduction","summary":" Vision-language models (VLMs) have achieved remarkable advancements,\ncapitalizing on the impressive capabilities of large language models (LLMs)\nacross diverse tasks. Despite this, a critical challenge known as hallucination\noccurs when models overconfidently describe objects or attributes absent from\nthe image, a problem exacerbated by the tendency of VLMs to rely on linguistic\npriors. This limitation reduces model reliability in high-stakes applications.\nIn this work, we have observed the characteristic of logits' continuity\nconsistency enhancement and introduced a straightforward and efficient method,\nCross-Temporal Prediction Connection (TPC), designed to enhance the semantic\nconsistency of logits by connecting them temporally across timesteps. TPC\namplifies information flow and improves coherence, effectively reducing\nhallucination. 
Extensive experiments show that TPC surpasses existing\nrepresentatives, delivering superior performance in both accuracy and\nefficiency while maintaining robustness in open-ended text generation tasks.\n","authors":["Chao Wang","Weiwei Fu","Yang Zhou"],"pdf_url":"https://arxiv.org/pdf/2503.04457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04452v1","updated":"2025-03-06T14:06:35Z","published":"2025-03-06T14:06:35Z","title":"A lightweight model FDM-YOLO for small target improvement based on\n YOLOv8","summary":" Small targets are particularly difficult to detect due to their low pixel\ncount, complex backgrounds, and varying shooting angles, which make it hard for\nmodels to extract effective features. While some large-scale models offer high\naccuracy, their long inference times make them unsuitable for real-time\ndeployment on edge devices. On the other hand, models designed for low\ncomputational power often suffer from poor detection accuracy. This paper\nfocuses on small target detection and explores methods for object detection\nunder low computational constraints. Building on the YOLOv8 model, we propose a\nnew network architecture called FDM-YOLO. Our research includes the following\nkey contributions: We introduce FDM-YOLO by analyzing the output of the YOLOv8\ndetection head. We add a highresolution layer and remove the large target\ndetection layer to better handle small targets. Based on PConv, we propose a\nlightweight network structure called Fast-C2f, which is integrated into the PAN\nmodule of the model. 
To mitigate the accuracy loss caused by model\nlightweighting, we employ dynamic upsampling (Dysample) and a lightweight EMA\nattention mechanism.The FDM-YOLO model was validated on the Visdrone dataset,\nachieving a 38% reduction in parameter count and improving the Map0.5 score\nfrom 38.4% to 42.5%, all while maintaining nearly the same inference speed.\nThis demonstrates the effectiveness of our approach in balancing accuracy and\nefficiency for edge device deployment.\n","authors":["Xuerui Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04484v3","updated":"2025-03-06T14:06:24Z","published":"2023-12-07T17:59:53Z","title":"FRNet: Frustum-Range Networks for Scalable LiDAR Segmentation","summary":" LiDAR segmentation has become a crucial component of advanced autonomous\ndriving systems. Recent range-view LiDAR segmentation approaches show promise\nfor real-time processing. However, they inevitably suffer from corrupted\ncontextual information and rely heavily on post-processing techniques for\nprediction refinement. In this work, we propose FRNet, a simple yet powerful\nmethod aimed at restoring the contextual information of range image pixels\nusing corresponding frustum LiDAR points. First, a frustum feature encoder\nmodule is used to extract per-point features within the frustum region, which\npreserves scene consistency and is critical for point-level predictions. Next,\na frustum-point fusion module is introduced to update per-point features\nhierarchically, enabling each point to extract more surrounding information\nthrough the frustum features. Finally, a head fusion module is used to fuse\nfeatures at different levels for final semantic predictions. Extensive\nexperiments conducted on four popular LiDAR segmentation benchmarks under\nvarious task setups demonstrate the superiority of FRNet. Notably, FRNet\nachieves 73.3% and 82.5% mIoU scores on the testing sets of SemanticKITTI and\nnuScenes. 
While achieving competitive performance, FRNet operates 5 times\nfaster than state-of-the-art approaches. Such high efficiency opens up new\npossibilities for more scalable LiDAR segmentation. The code has been made\npublicly available at https://github.com/Xiangxu-0103/FRNet.\n","authors":["Xiang Xu","Lingdong Kong","Hui Shuai","Qingshan Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04484v3.pdf","comment":"TIP 2025; 18 pages, 11 figures, 14 tables; Code at\n https://github.com/Xiangxu-0103/FRNet"},{"id":"http://arxiv.org/abs/2407.13304v3","updated":"2025-03-06T14:06:01Z","published":"2024-07-18T09:07:23Z","title":"A Dataset and Benchmark for Shape Completion of Fruits for Agricultural\n Robotics","summary":" As the world population is expected to reach 10 billion by 2050, our\nagricultural production system needs to double its productivity despite a\ndecline of human workforce in the agricultural sector. Autonomous robotic\nsystems are one promising pathway to increase productivity by taking over\nlabor-intensive manual tasks like fruit picking. To be effective, such systems\nneed to monitor and interact with plants and fruits precisely, which is\nchallenging due to the cluttered nature of agricultural environments causing,\nfor example, strong occlusions. Thus, being able to estimate the complete 3D\nshapes of objects in presence of occlusions is crucial for automating\noperations such as fruit harvesting. In this paper, we propose the first\npublicly available 3D shape completion dataset for agricultural vision systems.\nWe provide an RGB-D dataset for estimating the 3D shape of fruits.\nSpecifically, our dataset contains RGB-D frames of single sweet peppers in lab\nconditions but also in a commercial greenhouse. For each fruit, we additionally\ncollected high-precision point clouds that we use as ground truth. 
For\nacquiring the ground truth shape, we developed a measuring process that allows\nus to record data of real sweet pepper plants, both in the lab and in the\ngreenhouse with high precision, and determine the shape of the sensed fruits.\nWe release our dataset, consisting of almost 7,000 RGB-D frames belonging to\nmore than 100 different fruits. We provide segmented RGB-D frames, with camera\nintrinsics to easily obtain colored point clouds, together with the\ncorresponding high-precision, occlusion-free point clouds obtained with a\nhigh-precision laser scanner. We additionally enable evaluation of shape\ncompletion approaches on a hidden test set through a public challenge on a\nbenchmark server.\n","authors":["Federico Magistri","Thomas Läbe","Elias Marks","Sumanth Nagulavancha","Yue Pan","Claus Smitt","Lasse Klingbeil","Michael Halstead","Heiner Kuhlmann","Chris McCool","Jens Behley","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2407.13304v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04444v1","updated":"2025-03-06T14:00:59Z","published":"2025-03-06T14:00:59Z","title":"ToFu: Visual Tokens Reduction via Fusion for Multi-modal, Multi-patch,\n Multi-image Task","summary":" Large Multimodal Models (LMMs) are powerful tools that are capable of\nreasoning and understanding multimodal information beyond text and language.\nDespite their entrenched impact, the development of LMMs is hindered by the\nhigher computational requirements compared to their unimodal counterparts. One\nof the main causes of this is the large amount of tokens needed to encode the\nvisual input, which is especially evident for multi-image multimodal tasks.\nRecent approaches to reduce visual tokens depend on the visual encoder\narchitecture, require fine-tuning the LLM to maintain the performance, and only\nconsider single-image scenarios. 
To address these limitations, we propose ToFu,\na visual encoder-agnostic, training-free Token Fusion strategy that combines\nredundant visual tokens of LMMs for high-resolution, multi-image, tasks. The\ncore intuition behind our method is straightforward yet effective: preserve\ndistinctive tokens while combining similar ones. We achieve this by\nsequentially examining visual tokens and deciding whether to merge them with\nothers or keep them as separate entities. We validate our approach on the\nwell-established LLaVA-Interleave Bench, which covers challenging multi-image\ntasks. In addition, we push to the extreme our method by testing it on a\nnewly-created benchmark, ComPairs, focused on multi-image comparisons where a\nlarger amount of images and visual tokens are inputted to the LMMs. Our\nextensive analysis, considering several LMM architectures, demonstrates the\nbenefits of our approach both in terms of efficiency and performance gain.\n","authors":["Vittorio Pippi","Matthieu Guillaumin","Silvia Cascianelli","Rita Cucchiara","Maximilian Jaritz","Loris Bazzani"],"pdf_url":"https://arxiv.org/pdf/2503.04444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04441v1","updated":"2025-03-06T13:56:48Z","published":"2025-03-06T13:56:48Z","title":"EvidMTL: Evidential Multi-Task Learning for Uncertainty-Aware Semantic\n Surface Mapping from Monocular RGB Images","summary":" For scene understanding in unstructured environments, an accurate and\nuncertainty-aware metric-semantic mapping is required to enable informed action\nselection by autonomous systems.Existing mapping methods often suffer from\noverconfident semantic predictions, and sparse and noisy depth sensing, leading\nto inconsistent map representations. In this paper, we therefore introduce\nEvidMTL, a multi-task learning framework that uses evidential heads for depth\nestimation and semantic segmentation, enabling uncertainty-aware inference from\nmonocular RGB images. 
To enable uncertainty-calibrated evidential multi-task\nlearning, we propose a novel evidential depth loss function that jointly\noptimizes the belief strength of the depth prediction in conjunction with\nevidential segmentation loss. Building on this, we present EvidKimera, an\nuncertainty-aware semantic surface mapping framework, which uses evidential\ndepth and semantics prediction for improved 3D metric-semantic consistency. We\ntrain and evaluate EvidMTL on the NYUDepthV2 and assess its zero-shot\nperformance on ScanNetV2, demonstrating superior uncertainty estimation\ncompared to conventional approaches while maintaining comparable depth\nestimation and semantic segmentation. In zero-shot mapping tests on ScanNetV2,\nEvidKimera outperforms Kimera in semantic surface mapping accuracy and\nconsistency, highlighting the benefits of uncertainty-aware mapping and\nunderscoring its potential for real-world robotic applications.\n","authors":["Rohit Menon","Nils Dengler","Sicong Pan","Gokul Krishna Chenchani","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2503.04441v1.pdf","comment":"Submitted to IROS 2025 Conference"},{"id":"http://arxiv.org/abs/2503.03272v2","updated":"2025-03-06T13:49:46Z","published":"2025-03-05T08:52:55Z","title":"Towards Effective and Sparse Adversarial Attack on Spiking Neural\n Networks via Breaking Invisible Surrogate Gradients","summary":" Spiking neural networks (SNNs) have shown their competence in handling\nspatial-temporal event-based data with low energy consumption. Similar to\nconventional artificial neural networks (ANNs), SNNs are also vulnerable to\ngradient-based adversarial attacks, wherein gradients are calculated by\nspatial-temporal back-propagation (STBP) and surrogate gradients (SGs).\nHowever, the SGs may be invisible for an inference-only model as they do not\ninfluence the inference results, and current gradient-based attacks are\nineffective for binary dynamic images captured by the dynamic vision sensor\n(DVS). 
While some approaches addressed the issue of invisible SGs through\nuniversal SGs, their SGs lack a correlation with the victim model, resulting in\nsub-optimal performance. Moreover, the imperceptibility of existing SNN-based\nbinary attacks is still insufficient. In this paper, we introduce an innovative\npotential-dependent surrogate gradient (PDSG) method to establish a robust\nconnection between the SG and the model, thereby enhancing the adaptability of\nadversarial attacks across various models with invisible SGs. Additionally, we\npropose the sparse dynamic attack (SDA) to effectively attack binary dynamic\nimages. Utilizing a generation-reduction paradigm, SDA can fully optimize the\nsparsity of adversarial perturbations. Experimental results demonstrate that\nour PDSG and SDA outperform state-of-the-art SNN-based attacks across various\nmodels and datasets. Specifically, our PDSG achieves 100% attack success rate\non ImageNet, and our SDA obtains 82% attack success rate by modifying only\n0.24% of the pixels on CIFAR10DVS. The code is available at\nhttps://github.com/ryime/PDSG-SDA .\n","authors":["Li Lun","Kunyu Feng","Qinglong Ni","Ling Liang","Yuan Wang","Ying Li","Dunshan Yu","Xiaoxin Cui"],"pdf_url":"https://arxiv.org/pdf/2503.03272v2.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04420v1","updated":"2025-03-06T13:23:03Z","published":"2025-03-06T13:23:03Z","title":"PointsToWood: A deep learning framework for complete canopy leaf-wood\n segmentation of TLS data across diverse European forests","summary":" Point clouds from Terrestrial Laser Scanning (TLS) are an increasingly\npopular source of data for studying plant structure and function but typically\nrequire extensive manual processing to extract ecologically important\ninformation. 
One key task is the accurate semantic segmentation of different\nplant material within point clouds, particularly wood and leaves, which is\nrequired to understand plant productivity, architecture and physiology.\nExisting automated semantic segmentation methods are primarily developed for\nsingle ecosystem types, and whilst they show good accuracy for biomass\nassessment from the trunk and large branches, often perform less well within\nthe crown. In this study, we demonstrate a new framework that uses a deep\nlearning architecture newly developed from PointNet and pointNEXT for\nprocessing 3D point clouds to provide a reliable semantic segmentation of wood\nand leaf in TLS point clouds from the tree base to branch tips, trained on data\nfrom diverse mature European forests. Our model uses meticulously labelled data\ncombined with voxel-based sampling, neighbourhood rescaling, and a novel gated\nreflectance integration module embedded throughout the feature extraction\nlayers. We evaluate its performance across open datasets from boreal,\ntemperate, Mediterranean and tropical regions, encompassing diverse ecosystem\ntypes and sensor characteristics. Our results show consistent outperformance\nagainst the most widely used PointNet based approach for leaf/wood segmentation\non our high-density TLS dataset collected across diverse mixed forest plots\nacross all major biomes in Europe. We also find consistently strong performance\ntested on others open data from China, Eastern Cameroon, Germany and Finland,\ncollected using both time-of-flight and phase-shift sensors, showcasing the\ntransferability of our model to a wide range of ecosystems and sensors.\n","authors":["Harry J. F. Owen","Matthew J. A. Allen","Stuart W. D. Grieve","Phill Wilkes","Emily R. 
Lines"],"pdf_url":"https://arxiv.org/pdf/2503.04420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08388v2","updated":"2025-03-06T13:19:58Z","published":"2024-09-12T20:34:34Z","title":"Continual Learning in 3D Point Clouds: Employing Spectral Techniques for\n Exemplar Selection","summary":" We introduce a novel framework for Continual Learning in 3D object\nclassification. Our approach, CL3D, is based on the selection of prototypes\nfrom each class using spectral clustering. For non-Euclidean data such as point\nclouds, spectral clustering can be employed as long as one can define a\ndistance measure between pairs of samples. Choosing the appropriate distance\nmeasure enables us to leverage 3D geometric characteristics to identify\nrepresentative prototypes for each class. We explore the effectiveness of\nclustering in the input space (3D points), local feature space\n(1024-dimensional points), and global feature space. We conduct experiments on\nthe ModelNet40, ShapeNet, and ScanNet datasets, achieving state-of-the-art\naccuracy exclusively through the use of input space features. By leveraging the\ncombined input, local, and global features, we have improved the\nstate-of-the-art on ModelNet and ShapeNet, utilizing nearly half the memory\nused by competing approaches. 
For the challenging ScanNet dataset, our method\nenhances accuracy by 4.1% while consuming just 28% of the memory used by our\ncompetitors, demonstrating the scalability of our approach.\n","authors":["Hossein Resani","Behrooz Nasihatkon","Mohammadreza Alimoradi Jazi"],"pdf_url":"https://arxiv.org/pdf/2409.08388v2.pdf","comment":"Accepted to WACV 2025, Tucson, Arizona, USA"},{"id":"http://arxiv.org/abs/2503.04416v1","updated":"2025-03-06T13:18:37Z","published":"2025-03-06T13:18:37Z","title":"Learning Transformer-based World Models with Contrastive Predictive\n Coding","summary":" The DreamerV3 algorithm recently obtained remarkable performance across\ndiverse environment domains by learning an accurate world model based on\nRecurrent Neural Networks (RNNs). Following the success of model-based\nreinforcement learning algorithms and the rapid adoption of the Transformer\narchitecture for its superior training efficiency and favorable scaling\nproperties, recent works such as STORM have proposed replacing RNN-based world\nmodels with Transformer-based world models using masked self-attention.\nHowever, despite the improved training efficiency of these methods, their\nimpact on performance remains limited compared to the Dreamer algorithm,\nstruggling to learn competitive Transformer-based world models. In this work,\nwe show that the next state prediction objective adopted in previous approaches\nis insufficient to fully exploit the representation capabilities of\nTransformers. We propose to extend world model predictions to longer time\nhorizons by introducing TWISTER (Transformer-based World model wIth contraSTivE\nRepresentations), a world model using action-conditioned Contrastive Predictive\nCoding to learn high-level temporal feature representations and improve the\nagent performance. 
TWISTER achieves a human-normalized mean score of 162% on\nthe Atari 100k benchmark, setting a new record among state-of-the-art methods\nthat do not employ look-ahead search.\n","authors":["Maxime Burchi","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2503.04416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03420v2","updated":"2025-03-06T12:52:29Z","published":"2024-05-06T12:40:15Z","title":"Implantable Adaptive Cells: A Novel Enhancement for Pre-Trained U-Nets\n in Medical Image Segmentation","summary":" This paper introduces a novel approach to enhance the performance of\npre-trained neural networks in medical image segmentation using gradient-based\nNeural Architecture Search (NAS) methods. We present the concept of Implantable\nAdaptive Cell (IAC), small modules identified through Partially-Connected DARTS\nbased approach, designed to be injected into the skip connections of an\nexisting and already trained U-shaped model. Unlike traditional NAS methods,\nour approach refines existing architectures without full retraining.\nExperiments on four medical datasets with MRI and CT images show consistent\naccuracy improvements on various U-Net configurations, with segmentation\naccuracy gain by approximately 5 percentage points across all validation\ndatasets, with improvements reaching up to 11\\%pt in the best-performing cases.\nThe findings of this study not only offer a cost-effective alternative to the\ncomplete overhaul of complex models for performance upgrades but also indicate\nthe potential applicability of our method to other architectures and problem\ndomains.\n","authors":["Emil Benedykciuk","Marcin Denkowski","Grzegorz Wójcik"],"pdf_url":"https://arxiv.org/pdf/2405.03420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20742v2","updated":"2025-03-06T12:50:44Z","published":"2025-02-28T05:47:34Z","title":"Structured Preference Optimization for Vision-Language Long-Horizon Task\n Planning","summary":" Existing methods for 
vision-language task planning excel in short-horizon\ntasks but often fall short in complex, long-horizon planning within dynamic\nenvironments. These challenges primarily arise from the difficulty of\neffectively training models to produce high-quality reasoning processes for\nlong-horizon tasks. To address this, we propose Structured Preference\nOptimization (SPO), which aims to enhance reasoning and action selection in\nlong-horizon task planning through structured preference evaluation and\noptimized training strategies. Specifically, SPO introduces: 1)\nPreference-Based Scoring and Optimization, which systematically evaluates\nreasoning chains based on task relevance, visual grounding, and historical\nconsistency; and 2) Curriculum-Guided Training, where the model progressively\nadapts from simple to complex tasks, improving its generalization ability in\nlong-horizon scenarios and enhancing reasoning robustness. To advance research\nin vision-language long-horizon task planning, we introduce ExtendaBench, a\ncomprehensive benchmark covering 1,509 tasks across VirtualHome and Habitat\n2.0, categorized into ultra-short, short, medium, and long tasks. Experimental\nresults demonstrate that SPO significantly improves reasoning quality and final\ndecision accuracy, outperforming prior methods on long-horizon tasks and\nunderscoring the effectiveness of preference-driven optimization in\nvision-language task planning. 
Specifically, SPO achieves a +5.98% GCR and\n+4.68% SR improvement in VirtualHome and a +3.30% GCR and +2.11% SR improvement\nin Habitat over the best-performing baselines.\n","authors":["Xiwen Liang","Min Lin","Weiqi Ruan","Rongtao Xu","Yuecheng Liu","Jiaqi Chen","Bingqian Lin","Yuzheng Zhuang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2502.20742v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2503.03285v2","updated":"2025-03-06T12:42:37Z","published":"2025-03-05T09:12:16Z","title":"Enhancing Vietnamese VQA through Curriculum Learning on Raw and\n Augmented Text Representations","summary":" Visual Question Answering (VQA) is a multimodal task requiring reasoning\nacross textual and visual inputs, which becomes particularly challenging in\nlow-resource languages like Vietnamese due to linguistic variability and the\nlack of high-quality datasets. Traditional methods often rely heavily on\nextensive annotated datasets, computationally expensive pipelines, and large\npre-trained models, specifically in the domain of Vietnamese VQA, limiting\ntheir applicability in such scenarios. To address these limitations, we propose\na training framework that combines a paraphrase-based feature augmentation\nmodule with a dynamic curriculum learning strategy. Explicitly, augmented\nsamples are considered \"easy\" while raw samples are regarded as \"hard\". The\nframework then utilizes a mechanism that dynamically adjusts the ratio of easy\nto hard samples during training, progressively modifying the same dataset to\nincrease its difficulty level. By enabling gradual adaptation to task\ncomplexity, this approach helps the Vietnamese VQA model generalize well, thus\nimproving overall performance. 
Experimental results show consistent\nimprovements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset,\nhighlighting both the potential and challenges of our approach in advancing VQA\nfor Vietnamese language.\n","authors":["Khoi Anh Nguyen","Linh Yen Vu","Thang Dinh Duong","Thuan Nguyen Duong","Huy Thanh Nguyen","Vinh Quang Dinh"],"pdf_url":"https://arxiv.org/pdf/2503.03285v2.pdf","comment":"10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and\n Intelligence"},{"id":"http://arxiv.org/abs/2503.04385v1","updated":"2025-03-06T12:36:35Z","published":"2025-03-06T12:36:35Z","title":"Scale-Invariant Adversarial Attack against Arbitrary-scale\n Super-resolution","summary":" The advent of local continuous image function (LIIF) has garnered significant\nattention for arbitrary-scale super-resolution (SR) techniques. However, while\nthe vulnerabilities of fixed-scale SR have been assessed, the robustness of\ncontinuous representation-based arbitrary-scale SR against adversarial attacks\nremains an area warranting further exploration. The elaborately designed\nadversarial attacks for fixed-scale SR are scale-dependent, which will cause\ntime-consuming and memory-consuming problems when applied to arbitrary-scale\nSR. To address this concern, we propose a simple yet effective\n``scale-invariant'' SR adversarial attack method with good transferability,\ntermed SIAGT. Specifically, we propose to construct resource-saving attacks by\nexploiting finite discrete points of continuous representation. In addition, we\nformulate a coordinate-dependent loss to enhance the cross-model\ntransferability of the attack. The attack can significantly deteriorate the SR\nimages while introducing imperceptible distortion to the targeted\nlow-resolution (LR) images. 
Experiments carried out on three popular LIIF-based\nSR approaches and four classical SR datasets show remarkable attack performance\nand transferability of SIAGT.\n","authors":["Yihao Huang","Xin Luo","Qing Guo","Felix Juefei-Xu","Xiaojun Jia","Weikai Miao","Geguang Pu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2503.04385v1.pdf","comment":"15 pages, accepted by TIFS 2025"},{"id":"http://arxiv.org/abs/2503.04376v1","updated":"2025-03-06T12:27:58Z","published":"2025-03-06T12:27:58Z","title":"MIDAS: Modeling Ground-Truth Distributions with Dark Knowledge for\n Domain Generalized Stereo Matching","summary":" Despite the significant advances in domain generalized stereo matching,\nexisting methods still exhibit domain-specific preferences when transferring\nfrom synthetic to real domains, hindering their practical applications in\ncomplex and diverse scenarios. The probability distributions predicted by the\nstereo network naturally encode rich similarity and uncertainty information.\nInspired by this observation, we propose to extract these two types of dark\nknowledge from the pre-trained network to model intuitive multi-modal\nground-truth distributions for both edge and non-edge regions. To mitigate the\ninherent domain preferences of a single network, we adopt network ensemble and\nfurther distinguish between objective and biased knowledge in the Laplace\nparameter space. Finally, the objective knowledge and the original disparity\nlabels are jointly modeled as a mixture of Laplacians to provide fine-grained\nsupervision for the stereo network training. Extensive experiments demonstrate\nthat: 1) Our method is generic and effectively improves the generalization of\nexisting networks. 2) PCWNet with our method achieves the state-of-the-art\ngeneralization performance on both KITTI 2015 and 2012 datasets. 
3) Our method\noutperforms existing methods in comprehensive ranking across four popular\nreal-world datasets.\n","authors":["Peng Xu","Zhiyu Xiang","Jingyun Fu","Tianyu Pu","Hanzhi Zhong","Eryun Liu"],"pdf_url":"https://arxiv.org/pdf/2503.04376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08824v3","updated":"2025-03-06T12:26:08Z","published":"2024-09-13T13:37:33Z","title":"Pathfinder for Low-altitude Aircraft with Binary Neural Network","summary":" A prior global topological map (e.g., the OpenStreetMap, OSM) can boost the\nperformance of autonomous mapping by a ground mobile robot. However, the prior\nmap is usually incomplete due to lacking labeling in partial paths. To solve\nthis problem, this paper proposes an OSM maker using airborne sensors carried\nby low-altitude aircraft, where the core of the OSM maker is a novel efficient\npathfinder approach based on LiDAR and camera data, i.e., a binary dual-stream\nroad segmentation model. Specifically, a multi-scale feature extraction based\non the UNet architecture is implemented for images and point clouds. To reduce\nthe effect caused by the sparsity of point cloud, an attention-guided gated\nblock is designed to integrate image and point-cloud features. To optimize the\nmodel for edge deployment that significantly reduces storage footprint and\ncomputational demands, we propose a binarization streamline to each model\ncomponent, including a variant of vision transformer (ViT) architecture as the\nencoder of the image branch, and new focal and perception losses to optimize\nthe model training. The experimental results on two datasets demonstrate that\nour pathfinder method achieves SOTA accuracy with high efficiency in finding\npaths from the low-level airborne sensors, and we can create complete OSM prior\nmaps based on the segmented road skeletons. 
Code and data are available at:\n\\href{https://github.com/IMRL/Pathfinder}{https://github.com/IMRL/Pathfinder}.\n","authors":["Kaijie Yin","Tian Gao","Hui Kong"],"pdf_url":"https://arxiv.org/pdf/2409.08824v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.16532v2","updated":"2025-03-06T12:19:59Z","published":"2025-02-23T10:48:11Z","title":"Deep unrolling for learning optimal spatially varying regularisation\n parameters for Total Generalised Variation","summary":" We extend a recently introduced deep unrolling framework for learning\nspatially varying regularisation parameters in inverse imaging problems to the\ncase of Total Generalised Variation (TGV). The framework combines a deep\nconvolutional neural network (CNN) inferring the two spatially varying TGV\nparameters with an unrolled algorithmic scheme that solves the corresponding\nvariational problem. The two subnetworks are jointly trained end-to-end in a\nsupervised fashion and as such the CNN learns to compute those parameters that\ndrive the reconstructed images as close to the ground truth as possible.\nNumerical results in image denoising and MRI reconstruction show a significant\nqualitative and quantitative improvement compared to the best TGV scalar\nparameter case as well as to other approaches employing spatially varying\nparameters computed by unsupervised methods. We also observe that the inferred\nspatially varying parameter maps have a consistent structure near the image\nedges, asking for further theoretical investigations. 
In particular, the\nparameter that weighs the first-order TGV term has a triple-edge structure with\nalternating high-low-high values whereas the one that weighs the second-order\nterm attains small values in a large neighbourhood around the edges.\n","authors":["Thanh Trung Vu","Andreas Kofler","Kostas Papafitsoros"],"pdf_url":"https://arxiv.org/pdf/2502.16532v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10329v2","updated":"2025-03-06T12:16:09Z","published":"2024-09-16T14:39:15Z","title":"InfoDisent: Explainability of Image Classification Models by Information\n Disentanglement","summary":" In this work, we introduce InfoDisent, a hybrid approach to explainability\nbased on the information bottleneck principle. InfoDisent enables the\ndisentanglement of information in the final layer of any pretrained model into\natomic concepts, which can be interpreted as prototypical parts. This approach\nmerges the flexibility of post-hoc methods with the concept-level modeling\ncapabilities of self-explainable neural networks, such as ProtoPNets. We\ndemonstrate the effectiveness of InfoDisent through computational experiments\nand user studies across various datasets using modern backbones such as ViTs\nand convolutional networks. Notably, InfoDisent generalizes the prototypical\nparts approach to novel domains (ImageNet).\n","authors":["Łukasz Struski","Dawid Rymarczyk","Jacek Tabor"],"pdf_url":"https://arxiv.org/pdf/2409.10329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01334v3","updated":"2025-03-06T11:59:11Z","published":"2024-08-02T15:32:42Z","title":"A Backbone for Long-Horizon Robot Task Understanding","summary":" End-to-end robot learning, particularly for long-horizon tasks, often results\nin unpredictable outcomes and poor generalization. To address these challenges,\nwe propose a novel Therblig-Based Backbone Framework (TBBF) as a fundamental\nstructure to enhance interpretability, data efficiency, and generalization in\nrobotic systems. 
TBBF utilizes expert demonstrations to enable therblig-level\ntask decomposition, facilitate efficient action-object mapping, and generate\nadaptive trajectories for new scenarios. The approach consists of two stages:\noffline training and online testing. During the offline training stage, we\ndeveloped the Meta-RGate SynerFusion (MGSF) network for accurate therblig\nsegmentation across various tasks. In the online testing stage, after a\none-shot demonstration of a new task is collected, our MGSF network extracts\nhigh-level knowledge, which is then encoded into the image using Action\nRegistration (ActionREG). Additionally, Large Language Model (LLM)-Alignment\nPolicy for Visual Correction (LAP-VC) is employed to ensure precise action\nregistration, facilitating trajectory transfer in novel robot scenarios.\nExperimental results validate these methods, achieving 94.37% recall in\ntherblig segmentation and success rates of 94.4% and 80% in real-world online\nrobot testing for simple and complex scenarios, respectively. Supplementary\nmaterial is available at:\nhttps://sites.google.com/view/therbligsbasedbackbone/home\n","authors":["Xiaoshuai Chen","Wei Chen","Dongmyoung Lee","Yukun Ge","Nicolas Rojas","Petar Kormushev"],"pdf_url":"https://arxiv.org/pdf/2408.01334v3.pdf","comment":"8 pages, 8 figures. This work has been published by IEEE Robotics and\n Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2503.04353v1","updated":"2025-03-06T11:55:44Z","published":"2025-03-06T11:55:44Z","title":"ObjMST: An Object-Focused Multimodal Style Transfer Framework","summary":" We propose ObjMST, an object-focused multimodal style transfer framework that\nprovides separate style supervision for salient objects and surrounding\nelements while addressing alignment issues in multimodal representation\nlearning. 
Existing image-text multimodal style transfer methods face the\nfollowing challenges: (1) generating non-aligned and inconsistent multimodal\nstyle representations; and (2) content mismatch, where identical style patterns\nare applied to both salient objects and their surrounding elements. Our\napproach mitigates these issues by: (1) introducing a Style-Specific Masked\nDirectional CLIP Loss, which ensures consistent and aligned style\nrepresentations for both salient objects and their surroundings; and (2)\nincorporating a salient-to-key mapping mechanism for stylizing salient objects,\nfollowed by image harmonization to seamlessly blend the stylized objects with\ntheir environment. We validate the effectiveness of ObjMST through experiments,\nusing both quantitative metrics and qualitative visual evaluations of the\nstylized outputs. Our code is available at:\nhttps://github.com/chandagrover/ObjMST.\n","authors":["Chanda Grover Kamra","Indra Deep Mastan","Debayan Gupta"],"pdf_url":"https://arxiv.org/pdf/2503.04353v1.pdf","comment":"8 pages, 8 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2503.04351v1","updated":"2025-03-06T11:49:43Z","published":"2025-03-06T11:49:43Z","title":"PLMP -- Point-Line Minimal Problems for Projective SfM","summary":" We completely classify all minimal problems for Structure-from-Motion (SfM)\nwhere arrangements of points and lines are fully observed by multiple\nuncalibrated pinhole cameras. We find 291 minimal problems, 73 of which have\nunique solutions and can thus be solved linearly. Two of the linear problems\nallow an arbitrary number of views, while all other minimal problems have at\nmost 9 cameras. All minimal problems have at most 7 points and at most 12\nlines. We compute the number of solutions of each minimal problem, as this\ngives a measurement of the problem's intrinsic difficulty, and find that these\nnumber are relatively low (e.g., when comparing with minimal problems for\ncalibrated cameras). 
Finally, by exploring stabilizer subgroups of\nsubarrangements, we develop a geometric and systematic way to 1) factorize\nminimal problems into smaller problems, 2) identify minimal problems in\nunderconstrained problems, and 3) formally prove non-minimality.\n","authors":["Kim Kiehn","Albin Ahlbäck","Kathlén Kohn"],"pdf_url":"https://arxiv.org/pdf/2503.04351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04344v1","updated":"2025-03-06T11:41:36Z","published":"2025-03-06T11:41:36Z","title":"LEDiT: Your Length-Extrapolatable Diffusion Transformer without\n Positional Encoding","summary":" Diffusion transformers(DiTs) struggle to generate images at resolutions\nhigher than their training resolutions. The primary obstacle is that the\nexplicit positional encodings(PE), such as RoPE, need extrapolation which\ndegrades performance when the inference resolution differs from training. In\nthis paper, we propose a Length-Extrapolatable Diffusion Transformer(LEDiT), a\nsimple yet powerful architecture to overcome this limitation. LEDiT needs no\nexplicit PEs, thereby avoiding extrapolation. The key innovations of LEDiT are\nintroducing causal attention to implicitly impart global positional information\nto tokens, while enhancing locality to precisely distinguish adjacent tokens.\nExperiments on 256x256 and 512x512 ImageNet show that LEDiT can scale the\ninference resolution to 512x512 and 1024x1024, respectively, while achieving\nbetter image quality compared to current state-of-the-art length extrapolation\nmethods(NTK-aware, YaRN). 
Moreover, LEDiT achieves strong extrapolation\nperformance with just 100K steps of fine-tuning on a pretrained DiT,\ndemonstrating its potential for integration into existing text-to-image DiTs.\n","authors":["Shen Zhang","Yaning Tan","Siyuan Liang","Linze Li","Ge Wu","Yuhao Chen","Shuheng Li","Zhenyu Zhao","Caihua Chen","Jiajun Liang","Yao Tang"],"pdf_url":"https://arxiv.org/pdf/2503.04344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03644v2","updated":"2025-03-06T11:36:33Z","published":"2025-03-05T16:20:53Z","title":"DongbaMIE: A Multimodal Information Extraction Dataset for Evaluating\n Semantic Understanding of Dongba Pictograms","summary":" Dongba pictographs are the only pictographs still in use in the world. They\nhave pictorial ideographic features, and their symbols carry rich cultural and\ncontextual information. Due to the lack of relevant datasets, existing research\nhas difficulty in advancing the study of semantic understanding of Dongba\npictographs. To this end, we propose DongbaMIE, the first multimodal dataset\nfor semantic understanding and extraction of Dongba pictographs. The dataset\nconsists of Dongba pictograph images and their corresponding Chinese semantic\nannotations. It contains 23,530 sentence-level and 2,539 paragraph-level\nimages, covering four semantic dimensions: objects, actions, relations, and\nattributes. We systematically evaluate the GPT-4o, Gemini-2.0, and Qwen2-VL\nmodels. Experimental results show that the F1 scores of GPT-4o and Gemini in\nthe best object extraction are only 3.16 and 3.11 respectively. The F1 score of\nQwen2-VL after supervised fine-tuning is only 11.49. These results suggest that\ncurrent large multimodal models still face significant challenges in accurately\nrecognizing the diverse semantic information in Dongba pictographs. 
The dataset\ncan be obtained from this URL.\n","authors":["Xiaojun Bi","Shuo Li","Ziyue Wang","Fuwen Luo","Weizheng Qiao","Lu Han","Ziwei Sun","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2503.03644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04333v1","updated":"2025-03-06T11:31:08Z","published":"2025-03-06T11:31:08Z","title":"GaussianVideo: Efficient Video Representation and Compression by\n Gaussian Splatting","summary":" Implicit Neural Representation for Videos (NeRV) has introduced a novel\nparadigm for video representation and compression, outperforming traditional\ncodecs. As model size grows, however, slow encoding and decoding speed and high\nmemory consumption hinder its application in practice. To address these\nlimitations, we propose a new video representation and compression method based\non 2D Gaussian Splatting to efficiently handle video data. Our proposed\ndeformable 2D Gaussian Splatting dynamically adapts the transformation of 2D\nGaussians at each frame, significantly reducing memory cost. Equipped with a\nmulti-plane-based spatiotemporal encoder and a lightweight decoder, it predicts\nchanges in color, coordinates, and shape of initialized Gaussians, given the\ntime step. By leveraging temporal gradients, our model effectively captures\ntemporal redundancy at negligible cost, significantly enhancing video\nrepresentation efficiency. 
Our method reduces GPU memory usage by up to 78.4%,\nand significantly expedites video processing, achieving 5.5x faster training\nand 12.5x faster decoding compared to the state-of-the-art NeRV methods.\n","authors":["Inseo Lee","Youngyoon Choi","Joonseok Lee"],"pdf_url":"https://arxiv.org/pdf/2503.04333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04325v1","updated":"2025-03-06T11:18:22Z","published":"2025-03-06T11:18:22Z","title":"GBT-SAM: A Parameter-Efficient Depth-Aware Model for Generalizable Brain\n tumour Segmentation on mp-MRI","summary":" Gliomas are brain tumours that stand out for their highly lethal and\naggressive nature, which demands a precise approach in their diagnosis. Medical\nimage segmentation plays a crucial role in the evaluation and follow-up of\nthese tumours, allowing specialists to analyse their morphology. However,\nexisting methods for automatic glioma segmentation often lack generalization\ncapability across other brain tumour domains, require extensive computational\nresources, or fail to fully utilize the multi-parametric MRI (mp-MRI) data used\nto delineate them. In this work, we introduce GBT-SAM, a novel Generalizable\nBrain Tumour (GBT) framework that extends the Segment Anything Model (SAM) to\nbrain tumour segmentation tasks. Our method employs a two-step training\nprotocol: first, fine-tuning the patch embedding layer to process the entire\nmp-MRI modalities, and second, incorporating parameter-efficient LoRA blocks\nand a Depth-Condition block into the Vision Transformer (ViT) to capture\ninter-slice correlations. GBT-SAM achieves state-of-the-art performance on the\nAdult Glioma dataset (Dice Score of $93.54$) while demonstrating robust\ngeneralization across Meningioma, Pediatric Glioma, and Sub-Saharan Glioma\ndatasets. Furthermore, GBT-SAM uses less than 6.5M trainable parameters, thus\noffering an efficient solution for brain tumour segmentation. 
\\\\ Our code and\nmodels are available at https://github.com/vpulab/med-sam-brain .\n","authors":["Cecilia Diana-Albelda","Roberto Alcover-Couso","Álvaro García-Martín","Jesus Bescos","Marcos Escudero-Viñolo"],"pdf_url":"https://arxiv.org/pdf/2503.04325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.17634v2","updated":"2025-03-06T11:17:31Z","published":"2025-01-29T13:11:21Z","title":"Federated Learning With Individualized Privacy Through Client Sampling","summary":" With growing concerns about user data collection, individualized privacy has\nemerged as a promising solution to balance protection and utility by accounting\nfor diverse user privacy preferences. Instead of enforcing a uniform level of\nanonymization for all users, this approach allows individuals to choose privacy\nsettings that align with their comfort levels. Building on this idea, we\npropose an adapted method for enabling Individualized Differential Privacy\n(IDP) in Federated Learning (FL) by handling clients according to their\npersonal privacy preferences. By extending the SAMPLE algorithm from\ncentralized settings to FL, we calculate client-specific sampling rates based\non their heterogeneous privacy budgets and integrate them into a modified\nIDP-FedAvg algorithm. We test this method under realistic privacy distributions\nand multiple datasets. The experimental results demonstrate that our approach\nachieves clear improvements over uniform DP baselines, reducing the trade-off\nbetween privacy and utility. Compared to the alternative SCALE method in\nrelated work, which assigns differing noise scales to clients, our method\nperforms notably better. However, challenges remain for complex tasks with\nnon-i.i.d. 
data, primarily stemming from the constraints of the decentralized\nsetting.\n","authors":["Lucas Lange","Ole Borchardt","Erhard Rahm"],"pdf_url":"https://arxiv.org/pdf/2501.17634v2.pdf","comment":"Accepted at 10th International Conference on Machine Learning\n Technologies (ICMLT 2025)"},{"id":"http://arxiv.org/abs/2503.04322v1","updated":"2025-03-06T11:14:59Z","published":"2025-03-06T11:14:59Z","title":"A Modular Pipeline for 3D Object Tracking Using RGB Cameras","summary":" Object tracking is a key challenge of computer vision with various\napplications that all require different architectures. Most tracking systems\nhave limitations such as constraining all movement to a 2D plane and they often\ntrack only one object. In this paper, we present a new modular pipeline that\ncalculates 3D trajectories of multiple objects. It is adaptable to various\nsettings where multiple time-synced and stationary cameras record moving\nobjects, using off the shelf webcams. Our pipeline was tested on the Table\nSetting Dataset, where participants are recorded with various sensors as they\nset a table with tableware objects. We need to track these manipulated objects,\nusing 6 rgb webcams. Challenges include: Detecting small objects in 9.874.699\ncamera frames, determining camera poses, discriminating between nearby and\noverlapping objects, temporary occlusions, and finally calculating a 3D\ntrajectory using the right subset of an average of 11.12.456 pixel coordinates\nper 3-minute trial. We implement a robust pipeline that results in accurate\ntrajectories with covariance of x,y,z-position as a confidence metric. It deals\ndynamically with appearing and disappearing objects, instantiating new Extended\nKalman Filters. It scales to hundreds of table-setting trials with very little\nhuman annotation input, even with the camera poses of each trial unknown. 
The\ncode is available at https://github.com/LarsBredereke/object_tracking\n","authors":["Lars Bredereke","Yale Hartmann","Tanja Schultz"],"pdf_url":"https://arxiv.org/pdf/2503.04322v1.pdf","comment":"9 pages, 11 figures, original paper not to be published anywhere else"},{"id":"http://arxiv.org/abs/2501.16981v3","updated":"2025-03-06T11:08:38Z","published":"2025-01-28T14:28:55Z","title":"Modulating CNN Features with Pre-Trained ViT Representations for\n Open-Vocabulary Object Detection","summary":" Owing to large-scale image-text contrastive training, pre-trained vision\nlanguage model (VLM) like CLIP shows superior open-vocabulary recognition\nability. Most existing open-vocabulary object detectors attempt to utilize the\npre-trained VLMs to attain generalized representation. F-ViT uses the\npre-trained visual encoder as the backbone network and freezes it during\ntraining. However, its frozen backbone doesn't benefit from the labeled data to\nstrengthen the representation for detection. Therefore, we propose a novel\ntwo-branch backbone network, named as \\textbf{V}iT-Feature-\\textbf{M}odulated\nMulti-Scale \\textbf{C}onvolutional Network (VMCNet), which consists of a\ntrainable convolutional branch, a frozen pre-trained ViT branch and a VMC\nmodule. The trainable CNN branch could be optimized with labeled data while the\nfrozen pre-trained ViT branch could keep the representation ability derived\nfrom large-scale pre-training. Then, the proposed VMC module could modulate the\nmulti-scale CNN features with the representations from ViT branch. With this\nproposed mixed structure, the detector is more likely to discover objects of\nnovel categories. Evaluated on two popular benchmarks, our method boosts the\ndetection performance on novel category and outperforms state-of-the-art\nmethods. On OV-COCO, the proposed method achieves 44.3\nAP$_{50}^{\\mathrm{novel}}$ with ViT-B/16 and 48.5 AP$_{50}^{\\mathrm{novel}}$\nwith ViT-L/14. 
On OV-LVIS, VMCNet with ViT-B/16 and ViT-L/14 reaches 27.8 and\n38.4 mAP$_{r}$.\n","authors":["Xiangyu Gao","Yu Dai","Benliu Qiu","Lanxiao Wang","Heqian Qiu","Hongliang Li"],"pdf_url":"https://arxiv.org/pdf/2501.16981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01879v3","updated":"2025-03-06T11:05:33Z","published":"2024-02-02T20:08:11Z","title":"$σ$-zero: Gradient-based Optimization of $\\ell_0$-norm Adversarial\n Examples","summary":" Evaluating the adversarial robustness of deep networks to gradient-based\nattacks is challenging. While most attacks consider $\\ell_2$- and\n$\\ell_\\infty$-norm constraints to craft input perturbations, only a few\ninvestigate sparse $\\ell_1$- and $\\ell_0$-norm attacks. In particular,\n$\\ell_0$-norm attacks remain the least studied due to the inherent complexity\nof optimizing over a non-convex and non-differentiable constraint. However,\nevaluating adversarial robustness under these attacks could reveal weaknesses\notherwise left untested with more conventional $\\ell_2$- and $\\ell_\\infty$-norm\nattacks. In this work, we propose a novel $\\ell_0$-norm attack, called\n$\\sigma$-zero, which leverages a differentiable approximation of the $\\ell_0$\nnorm to facilitate gradient-based optimization, and an adaptive projection\noperator to dynamically adjust the trade-off between loss minimization and\nperturbation sparsity. 
Extensive evaluations using MNIST, CIFAR10, and ImageNet\ndatasets, involving robust and non-robust models, show that\n$\\sigma$\\texttt{-zero} finds minimum $\\ell_0$-norm adversarial examples without\nrequiring any time-consuming hyperparameter tuning, and that it outperforms all\ncompeting sparse attacks in terms of success rate, perturbation size, and\nefficiency.\n","authors":["Antonio Emanuele Cinà","Francesco Villani","Maura Pintor","Lea Schönherr","Battista Biggio","Marcello Pelillo"],"pdf_url":"https://arxiv.org/pdf/2402.01879v3.pdf","comment":"Paper accepted at International Conference on Learning\n Representations (ICLR 2025). Code available at\n https://github.com/sigma0-advx/sigma-zero"},{"id":"http://arxiv.org/abs/2412.00156v3","updated":"2025-03-06T11:05:32Z","published":"2024-11-29T08:10:49Z","title":"VISION-XL: High Definition Video Inverse Problem Solver using Latent\n Image Diffusion Models","summary":" In this paper, we propose a novel framework for solving high-definition video\ninverse problems using latent image diffusion models. Building on recent\nadvancements in spatio-temporal optimization for video inverse problems using\nimage diffusion models, our approach leverages latent-space diffusion models to\nachieve enhanced video quality and resolution. To address the high\ncomputational demands of processing high-resolution frames, we introduce a\npseudo-batch consistent sampling strategy, allowing efficient operation on a\nsingle GPU. Additionally, to improve temporal consistency, we present\npseudo-batch inversion, an initialization technique that incorporates\ninformative latents from the measurement. By integrating with SDXL, our\nframework achieves state-of-the-art video reconstruction across a wide range of\nspatio-temporal inverse problems, including complex combinations of frame\naveraging and various spatial degradations, such as deblurring,\nsuper-resolution, and inpainting. 
Unlike previous methods, our approach\nsupports multiple aspect ratios (landscape, vertical, and square) and delivers\nHD-resolution reconstructions (exceeding 1280x720) in under 6 seconds per frame\non a single NVIDIA 4090 GPU.\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2412.00156v3.pdf","comment":"Project page: https://vision-xl.github.io/"},{"id":"http://arxiv.org/abs/2501.10814v2","updated":"2025-03-06T11:05:23Z","published":"2025-01-18T16:23:09Z","title":"No More Sliding Window: Efficient 3D Medical Image Segmentation with\n Differentiable Top-k Patch Sampling","summary":" 3D models surpass 2D models in CT/MRI segmentation by effectively capturing\ninter-slice relationships. However, the added depth dimension substantially\nincreases memory consumption. While patch-based training alleviates memory\nconstraints, it significantly slows down the inference speed due to the sliding\nwindow (SW) approach. We propose No-More-Sliding-Window (NMSW), a novel\nend-to-end trainable framework that enhances the efficiency of generic 3D\nsegmentation backbone during an inference step by eliminating the need for SW.\nNMSW employs a differentiable Top-k module to selectively sample only the most\nrelevant patches, thereby minimizing redundant computations. When patch-level\npredictions are insufficient, the framework intelligently leverages coarse\nglobal predictions to refine results. Evaluated across 3 tasks using 3\nsegmentation backbones, NMSW achieves competitive accuracy compared to SW\ninference while significantly reducing computational complexity by 91% (88.0 to\n8.00 TMACs). Moreover, it delivers a 9.1x faster inference on the H100 GPU\n(99.0 to 8.3 sec) and a 11.1x faster inference on the Xeon Gold CPU (2110 to\n189 sec). 
NMSW is model-agnostic, further boosting efficiency when integrated\nwith any existing efficient segmentation backbones.\n","authors":["Young Seok Jeon","Hongfei Yang","Huazhu Fu","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2501.10814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04314v1","updated":"2025-03-06T10:58:26Z","published":"2025-03-06T10:58:26Z","title":"S2Gaussian: Sparse-View Super-Resolution 3D Gaussian Splatting","summary":" In this paper, we aim ambitiously for a realistic yet challenging problem,\nnamely, how to reconstruct high-quality 3D scenes from sparse low-resolution\nviews that simultaneously suffer from deficient perspectives and clarity.\nWhereas existing methods only deal with either sparse views or low-resolution\nobservations, they fail to handle such hybrid and complicated scenarios. To\nthis end, we propose a novel Sparse-view Super-resolution 3D Gaussian Splatting\nframework, dubbed S2Gaussian, that can reconstruct structure-accurate and\ndetail-faithful 3D scenes with only sparse and low-resolution views. The\nS2Gaussian operates in a two-stage fashion. In the first stage, we initially\noptimize a low-resolution Gaussian representation with depth regularization and\ndensify it to initialize the high-resolution Gaussians through a tailored\nGaussian Shuffle Split operation. In the second stage, we refine the\nhigh-resolution Gaussians with the super-resolved images generated from both\noriginal sparse views and pseudo-views rendered by the low-resolution\nGaussians. 
In which a customized blur-free inconsistency modeling scheme and a\n3D robust optimization strategy are elaborately designed to mitigate multi-view\ninconsistency and eliminate erroneous updates caused by imperfect supervision.\nExtensive experiments demonstrate superior results and in particular\nestablishing new state-of-the-art performances with more consistent geometry\nand finer details.\n","authors":["Yecong Wan","Mingwen Shao","Yuanshuo Cheng","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2503.04314v1.pdf","comment":"CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04308v1","updated":"2025-03-06T10:51:04Z","published":"2025-03-06T10:51:04Z","title":"Shaken, Not Stirred: A Novel Dataset for Visual Understanding of Glasses\n in Human-Robot Bartending Tasks","summary":" Datasets for object detection often do not account for enough variety of\nglasses, due to their transparent and reflective properties. Specifically,\nopen-vocabulary object detectors, widely used in embodied robotic agents, fail\nto distinguish subclasses of glasses. This scientific gap poses an issue to\nrobotic applications that suffer from accumulating errors between detection,\nplanning, and action execution. The paper introduces a novel method for the\nacquisition of real-world data from RGB-D sensors that minimizes human effort.\nWe propose an auto-labeling pipeline that generates labels for all the acquired\nframes based on the depth measurements. We provide a novel real-world glass\nobject dataset that was collected on the Neuro-Inspired COLlaborator (NICOL), a\nhumanoid robot platform. The data set consists of 7850 images recorded from\nfive different cameras. We show that our trained baseline model outperforms\nstate-of-the-art open-vocabulary approaches. 
In addition, we deploy our\nbaseline model in an embodied agent approach to the NICOL platform, on which it\nachieves a success rate of 81% in a human-robot bartending scenario.\n","authors":["Lukáš Gajdošech","Hassan Ali","Jan-Gerrit Habekost","Martin Madaras","Matthias Kerzel","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2503.04308v1.pdf","comment":"Submitted to IEEE/RSJ International Conference on Intelligent Robots\n and Systems (IROS) 2025"},{"id":"http://arxiv.org/abs/2412.01615v3","updated":"2025-03-06T10:49:58Z","published":"2024-12-02T15:38:44Z","title":"OmniGuard: Hybrid Manipulation Localization via Augmented Versatile Deep\n Image Watermarking","summary":" With the rapid growth of generative AI and its widespread application in\nimage editing, new risks have emerged regarding the authenticity and integrity\nof digital content. Existing versatile watermarking approaches suffer from\ntrade-offs between tamper localization precision and visual quality.\nConstrained by the limited flexibility of previous framework, their localized\nwatermark must remain fixed across all images. Under AIGC-editing, their\ncopyright extraction accuracy is also unsatisfactory. To address these\nchallenges, we propose OmniGuard, a novel augmented versatile watermarking\napproach that integrates proactive embedding with passive, blind extraction for\nrobust copyright protection and tamper localization. OmniGuard employs a hybrid\nforensic framework that enables flexible localization watermark selection and\nintroduces a degradation-aware tamper extraction network for precise\nlocalization under challenging conditions. Additionally, a lightweight\nAIGC-editing simulation layer is designed to enhance robustness across global\nand local editing. Extensive experiments show that OmniGuard achieves superior\nfidelity, robustness, and flexibility. 
Compared to the recent state-of-the-art\napproach EditGuard, our method outperforms it by 4.25dB in PSNR of the\ncontainer image, 20.7% in F1-Score under noisy conditions, and 14.8% in average\nbit accuracy.\n","authors":["Xuanyu Zhang","Zecheng Tang","Zhipei Xu","Runyi Li","Youmin Xu","Bin Chen","Feng Gao","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.01615v3.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.03370v2","updated":"2025-03-06T10:41:28Z","published":"2025-03-05T10:46:03Z","title":"MIAdapt: Source-free Few-shot Domain Adaptive Object Detection for\n Microscopic Images","summary":" Existing generic unsupervised domain adaptation approaches require access to\nboth a large labeled source dataset and a sufficient unlabeled target dataset\nduring adaptation. However, collecting a large dataset, even if unlabeled, is a\nchallenging and expensive endeavor, especially in medical imaging. In addition,\nconstraints such as privacy issues can result in cases where source data is\nunavailable. Taking in consideration these challenges, we propose MIAdapt, an\nadaptive approach for Microscopic Imagery Adaptation as a solution for\nSource-free Few-shot Domain Adaptive Object detection (SF-FSDA). We also define\ntwo competitive baselines (1) Faster-FreeShot and (2) MT-FreeShot. Extensive\nexperiments on the challenging M5-Malaria and Raabin-WBC datasets validate the\neffectiveness of MIAdapt. Without using any image from the source domain\nMIAdapt surpasses state-of-the-art source-free UDA (SF-UDA) methods by +21.3%\nmAP and few-shot domain adaptation (FSDA) approaches by +4.7% mAP on\nRaabin-WBC. 
Our code and models will be publicly available.\n","authors":["Nimra Dilawar","Sara Nadeem","Javed Iqbal","Waqas Sultani","Mohsen Ali"],"pdf_url":"https://arxiv.org/pdf/2503.03370v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03286v2","updated":"2025-03-06T10:12:31Z","published":"2023-12-06T04:32:38Z","title":"Indirect Gradient Matching for Adversarial Robust Distillation","summary":" Adversarial training significantly improves adversarial robustness, but\nsuperior performance is primarily attained with large models. This substantial\nperformance gap for smaller models has spurred active research into adversarial\ndistillation (AD) to mitigate the difference. Existing AD methods leverage the\nteacher's logits as a guide. In contrast to these approaches, we aim to\ntransfer another piece of knowledge from the teacher, the input gradient. In\nthis paper, we propose a distillation module termed Indirect Gradient\nDistillation Module (IGDM) that indirectly matches the student's input gradient\nwith that of the teacher. 
Experimental results show that IGDM seamlessly\nintegrates with existing AD methods, significantly enhancing their performance.\nParticularly, utilizing IGDM on the CIFAR-100 dataset improves the AutoAttack\naccuracy from 28.06% to 30.32% with the ResNet-18 architecture and from 26.18%\nto 29.32% with the MobileNetV2 architecture when integrated into the SOTA\nmethod without additional data augmentation.\n","authors":["Hongsin Lee","Seungju Cho","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2312.03286v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04268v1","updated":"2025-03-06T09:57:26Z","published":"2025-03-06T09:57:26Z","title":"ControlFill: Spatially Adjustable Image Inpainting from Prompt Learning","summary":" In this report, I present an inpainting framework named \\textit{ControlFill},\nwhich involves training two distinct prompts: one for generating plausible\nobjects within a designated mask (\\textit{creation}) and another for filling\nthe region by extending the background (\\textit{removal}). During the inference\nstage, these learned embeddings guide a diffusion network that operates without\nrequiring heavy text encoders. By adjusting the relative significance of the\ntwo prompts and employing classifier-free guidance, users can control the\nintensity of removal or creation. Furthermore, I introduce a method to\nspatially vary the intensity of guidance by assigning different scales to\nindividual pixels.\n","authors":["Boseong Jeon"],"pdf_url":"https://arxiv.org/pdf/2503.04268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.18672v4","updated":"2025-03-06T09:55:41Z","published":"2025-01-30T18:51:54Z","title":"Drag Your Gaussian: Effective Drag-Based Editing with Score Distillation\n for 3D Gaussian Splatting","summary":" Recent advancements in 3D scene editing have been propelled by the rapid\ndevelopment of generative models. 
Existing methods typically utilize generative\nmodels to perform text-guided editing on 3D representations, such as 3D\nGaussian Splatting (3DGS). However, these methods are often limited to texture\nmodifications and fail when addressing geometric changes, such as editing a\ncharacter's head to turn around. Moreover, such methods lack accurate control\nover the spatial position of editing results, as language struggles to\nprecisely describe the extent of edits. To overcome these limitations, we\nintroduce DYG, an effective 3D drag-based editing method for 3D Gaussian\nSplatting. It enables users to conveniently specify the desired editing region\nand the desired dragging direction through the input of 3D masks and pairs of\ncontrol points, thereby enabling precise control over the extent of editing.\nDYG integrates the strengths of the implicit triplane representation to\nestablish the geometric scaffold of the editing results, effectively overcoming\nsuboptimal editing outcomes caused by the sparsity of 3DGS in the desired\nediting regions. Additionally, we incorporate a drag-based Latent Diffusion\nModel into our method through the proposed Drag-SDS loss function, enabling\nflexible, multi-view consistent, and fine-grained editing. Extensive\nexperiments demonstrate that DYG conducts effective drag-based editing guided\nby control point prompts, surpassing other baselines in terms of editing effect\nand quality, both qualitatively and quantitatively. 
Visit our project page at\nhttps://quyans.github.io/Drag-Your-Gaussian.\n","authors":["Yansong Qu","Dian Chen","Xinyang Li","Xiaofan Li","Shengchuan Zhang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2501.18672v4.pdf","comment":"Visit our project page at https://quyans.github.io/Drag-Your-Gaussian"},{"id":"http://arxiv.org/abs/2405.14736v2","updated":"2025-03-06T09:52:43Z","published":"2024-05-23T16:02:30Z","title":"GIFT: Unlocking Full Potential of Labels in Distilled Dataset at\n Near-zero Cost","summary":" Recent advancements in dataset distillation have demonstrated the significant\nbenefits of employing soft labels generated by pre-trained teacher models. In\nthis paper, we introduce a novel perspective by emphasizing the full\nutilization of labels. We first conduct a comprehensive comparison of various\nloss functions for soft label utilization in dataset distillation, revealing\nthat the model trained on the synthetic dataset exhibits high sensitivity to\nthe choice of loss function for soft label utilization. This finding highlights\nthe necessity of a universal loss function for training models on synthetic\ndatasets. Building on these insights, we introduce an extremely simple yet\nsurprisingly effective plug-and-play approach, GIFT, which encompasses soft\nlabel refinement and a cosine similarity-based loss function to efficiently\nleverage full label information. Extensive experiments indicate that GIFT\nconsistently enhances state-of-the-art dataset distillation methods across\nvarious dataset scales, without incurring additional computational costs.\nImportantly, GIFT significantly enhances cross-optimizer generalization, an\narea previously overlooked. For instance, on ImageNet-1K with IPC = 10, GIFT\nenhances the state-of-the-art method RDED by 30.8% in cross-optimizer\ngeneralization. 
Our code is available at https://github.com/LINs-lab/GIFT.\n","authors":["Xinyi Shang","Peng Sun","Tao Lin"],"pdf_url":"https://arxiv.org/pdf/2405.14736v2.pdf","comment":"https://github.com/LINs-lab/GIFT"},{"id":"http://arxiv.org/abs/2503.04258v1","updated":"2025-03-06T09:39:36Z","published":"2025-03-06T09:39:36Z","title":"TAIL: Text-Audio Incremental Learning","summary":" Many studies combine text and audio to capture multi-modal information but\nthey overlook the model's generalization ability on new datasets. Introducing\nnew datasets may affect the feature space of the original dataset, leading to\ncatastrophic forgetting. Meanwhile, large model parameters can significantly\nimpact training performance. To address these limitations, we introduce a novel\ntask called Text-Audio Incremental Learning (TAIL) task for text-audio\nretrieval, and propose a new method, PTAT, Prompt Tuning for Audio-Text\nincremental learning. This method utilizes prompt tuning to optimize the model\nparameters while incorporating an audio-text similarity and feature\ndistillation module to effectively mitigate catastrophic forgetting. We\nbenchmark our method and previous incremental learning methods on AudioCaps,\nClotho, BBC Sound Effects and Audioset datasets, and our method outperforms\nprevious methods significantly, particularly demonstrating stronger resistance\nto forgetting on older datasets. 
Compared to the full-parameters Finetune\n(Sequential) method, our model only requires 2.42\\% of its parameters,\nachieving 4.46\\% higher performance.\n","authors":["Yingfei Sun","Xu Gu","Wei Ji","Hanbin Zhao","Hao Fei","Yifang Yin","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2503.04258v1.pdf","comment":"4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2503.04257v1","updated":"2025-03-06T09:39:09Z","published":"2025-03-06T09:39:09Z","title":"How to Move Your Dragon: Text-to-Motion Synthesis for Large-Vocabulary\n Objects","summary":" Motion synthesis for diverse object categories holds great potential for 3D\ncontent creation but remains underexplored due to two key challenges: (1) the\nlack of comprehensive motion datasets that include a wide range of high-quality\nmotions and annotations, and (2) the absence of methods capable of handling\nheterogeneous skeletal templates from diverse objects. To address these\nchallenges, we contribute the following: First, we augment the Truebones Zoo\ndataset, a high-quality animal motion dataset covering over 70 species, by\nannotating it with detailed text descriptions, making it suitable for\ntext-based motion synthesis. Second, we introduce rig augmentation techniques\nthat generate diverse motion data while preserving consistent dynamics,\nenabling models to adapt to various skeletal configurations. Finally, we\nredesign existing motion diffusion models to dynamically adapt to arbitrary\nskeletal templates, enabling motion synthesis for a diverse range of objects\nwith varying structures. Experiments show that our method learns to generate\nhigh-fidelity motions from textual descriptions for diverse and even unseen\nobjects, setting a strong foundation for motion synthesis across diverse object\ncategories and skeletal templates. 
Qualitative results are available on this\nlink: t2m4lvo.github.io\n","authors":["Wonkwang Lee","Jongwon Jeong","Taehong Moon","Hyeon-Jong Kim","Jaehyeon Kim","Gunhee Kim","Byeong-Uk Lee"],"pdf_url":"https://arxiv.org/pdf/2503.04257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04250v1","updated":"2025-03-06T09:33:46Z","published":"2025-03-06T09:33:46Z","title":"An Egocentric Vision-Language Model based Portable Real-time Smart\n Assistant","summary":" We present Vinci, a vision-language system designed to provide real-time,\ncomprehensive AI assistance on portable devices. At its core, Vinci leverages\nEgoVideo-VL, a novel model that integrates an egocentric vision foundation\nmodel with a large language model (LLM), enabling advanced functionalities such\nas scene understanding, temporal grounding, video summarization, and future\nplanning. To enhance its utility, Vinci incorporates a memory module for\nprocessing long video streams in real time while retaining contextual history,\na generation module for producing visual action demonstrations, and a retrieval\nmodule that bridges egocentric and third-person perspectives to provide\nrelevant how-to videos for skill acquisition. Unlike existing systems that\noften depend on specialized hardware, Vinci is hardware-agnostic, supporting\ndeployment across a wide range of devices, including smartphones and wearable\ncameras. In our experiments, we first demonstrate the superior performance of\nEgoVideo-VL on multiple public benchmarks, showcasing its vision-language\nreasoning and contextual understanding capabilities. We then conduct a series\nof user studies to evaluate the real-world effectiveness of Vinci, highlighting\nits adaptability and usability in diverse scenarios. We hope Vinci can\nestablish a new framework for portable, real-time egocentric AI systems,\nempowering users with contextual and actionable insights. 
Including the\nfrontend, backend, and models, all codes of Vinci are available at\nhttps://github.com/OpenGVLab/vinci.\n","authors":["Yifei Huang","Jilan Xu","Baoqi Pei","Yuping He","Guo Chen","Mingfang Zhang","Lijin Yang","Zheng Nie","Jinyao Liu","Guoshun Fan","Dechen Lin","Fang Fang","Kunpeng Li","Chang Yuan","Xinyuan Chen","Yaohui Wang","Yali Wang","Yu Qiao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07481v5","updated":"2025-03-06T09:26:33Z","published":"2024-12-10T13:03:42Z","title":"Manta: Enhancing Mamba for Few-Shot Action Recognition of Long\n Sub-Sequence","summary":" In few-shot action recognition (FSAR), long sub-sequences of video naturally\nexpress entire actions more effectively. However, the high computational\ncomplexity of mainstream Transformer-based methods limits their application.\nRecent Mamba demonstrates efficiency in modeling long sequences, but directly\napplying Mamba to FSAR overlooks the importance of local feature modeling and\nalignment. Moreover, long sub-sequences within the same class accumulate\nintra-class variance, which adversely impacts FSAR performance. To solve these\nchallenges, we propose a Matryoshka MAmba and CoNtrasTive LeArning framework\n(Manta). Firstly, the Matryoshka Mamba introduces multiple Inner Modules to\nenhance local feature representation, rather than directly modeling global\nfeatures. An Outer Module captures dependencies of timeline between these local\nfeatures for implicit temporal alignment. Secondly, a hybrid contrastive\nlearning paradigm, combining both supervised and unsupervised methods, is\ndesigned to mitigate the negative effects of intra-class variance accumulation.\nThe Matryoshka Mamba and the hybrid contrastive learning paradigm operate in\ntwo parallel branches within Manta, enhancing Mamba for FSAR of long\nsub-sequence. 
Manta achieves new state-of-the-art performance on prominent\nbenchmarks, including SSv2, Kinetics, UCF101, and HMDB51. Extensive empirical\nstudies prove that Manta significantly improves FSAR of long sub-sequence from\nmultiple perspectives.\n","authors":["Wenbo Huang","Jinghui Zhang","Guang Li","Lei Zhang","Shuoyuan Wang","Fang Dong","Jiahui Jin","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2412.07481v5.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2408.09110v3","updated":"2025-03-06T09:26:00Z","published":"2024-08-17T06:24:43Z","title":"Locate Anything on Earth: Advancing Open-Vocabulary Object Detection for\n Remote Sensing Community","summary":" Object detection, particularly open-vocabulary object detection, plays a\ncrucial role in Earth sciences, such as environmental monitoring, natural\ndisaster assessment, and land-use planning. However, existing open-vocabulary\ndetectors, primarily trained on natural-world images, struggle to generalize to\nremote sensing images due to a significant data domain gap. Thus, this paper\naims to advance the development of open-vocabulary object detection in remote\nsensing community. To achieve this, we first reformulate the task as Locate\nAnything on Earth (LAE) with the goal of detecting any novel concepts on Earth.\nWe then developed the LAE-Label Engine which collects, auto-annotates, and\nunifies up to 10 remote sensing datasets creating the LAE-1M - the first\nlarge-scale remote sensing object detection dataset with broad category\ncoverage. Using the LAE-1M, we further propose and train the novel LAE-DINO\nModel, the first open-vocabulary foundation object detector for the LAE task,\nfeaturing Dynamic Vocabulary Construction (DVC) and Visual-Guided Text Prompt\nLearning (VisGT) modules. DVC dynamically constructs vocabulary for each\ntraining batch, while VisGT maps visual features to semantic space, enhancing\ntext features. 
We comprehensively conduct experiments on established remote\nsensing benchmark DIOR, DOTAv2.0, as well as our newly introduced 80-class\nLAE-80C benchmark. Results demonstrate the advantages of the LAE-1M dataset and\nthe effectiveness of the LAE-DINO method.\n","authors":["Jiancheng Pan","Yanxing Liu","Yuqian Fu","Muyuan Ma","Jiahao Li","Danda Pani Paudel","Luc Van Gool","Xiaomeng Huang"],"pdf_url":"https://arxiv.org/pdf/2408.09110v3.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2503.00168v2","updated":"2025-03-06T09:23:35Z","published":"2025-02-28T20:30:56Z","title":"SSL4EO-S12 v1.1: A Multimodal, Multiseasonal Dataset for Pretraining,\n Updated","summary":" This technical report presents SSL4EO-S12 v1.1, a multimodal, multitemporal\nEarth Observation dataset designed for pretraining large-scale foundation\nmodels. Building on the success of SSL4EO-S12 v1.0, the new version addresses\nthe previous challenges of data misalignment and a limited data structure for\nlow-barrier, analysis-ready EO processing. SSL4EO-S12 v1.1 covers the world's\n10,000 largest cities and its surroundings within a 50 km radius across four\nseasons, resulting in a diverse collection of nearly one million patches.\nSSL4EO-S12 v1.1 packages the data in Zarr file format for cloud-efficient\nloading and representation of meta-information such as including cloud masks\nand geolocation. Released under the CC-BY-4.0 license, SSL4EO-S12 v1.1\nfacilitates open research and provides a robust foundation for future\nadvancements in self-supervised learning and geospatial analysis. 
The dataset\nis available online through https://datapub.fz-juelich.de/ssl4eo-s12, and we\nprovided additional resources at https://github.com/DLR-MF-DAS/SSL4EO-S12-v1.1.\n","authors":["Benedikt Blumenstiel","Nassim Ait Ali Braham","Conrad M Albrecht","Stefano Maurogiovanni","Paolo Fraccaro"],"pdf_url":"https://arxiv.org/pdf/2503.00168v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04235v1","updated":"2025-03-06T09:15:13Z","published":"2025-03-06T09:15:13Z","title":"Geometry-Constrained Monocular Scale Estimation Using Semantic\n Segmentation for Dynamic Scenes","summary":" Monocular visual localization plays a pivotal role in advanced driver\nassistance systems and autonomous driving by estimating a vehicle's ego-motion\nfrom a single pinhole camera. Nevertheless, conventional monocular visual\nodometry encoun-ters challenges in scale estimation due to the absence of depth\ninformation during projection. Previous methodologies, whether rooted in\nphysical constraints or deep learning paradigms, con-tend with issues related\nto computational complexity and the management of dynamic objects. This study\nextends our prior research, presenting innovative strategies for ego-motion\nestima-tion and the selection of ground points. Striving for a nuanced\nequilibrium between computational efficiency and precision, we propose a hybrid\nmethod that leverages the SegNeXt model for real-time applications,\nencompassing both ego-motion estimation and ground point selection. Our\nmethodology incorporates dy-namic object masks to eliminate unstable features\nand employs ground plane masks for meticulous triangulation. Furthermore, we\nexploit Geometry-constraint to delineate road regions for scale recovery. The\nintegration of this approach with the mo-nocular version of ORB-SLAM3\nculminates in the accurate esti-mation of a road model, a pivotal component in\nour scale recov-ery process. 
Rigorous experiments, conducted on the KITTI\nda-taset, systematically compare our method with existing monocu-lar visual\nodometry algorithms and contemporary scale recovery methodologies. The results\nundeniably confirm the superior ef-fectiveness of our approach, surpassing\nstate-of-the-art visual odometry algorithms. Our source code is available at\nhttps://git hub.com/bFr0zNq/MVOSegScale.\n","authors":["Hui Zhang","Zhiyang Wu","Qianqian Shangguan","Kang An"],"pdf_url":"https://arxiv.org/pdf/2503.04235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07076v2","updated":"2025-03-06T09:13:28Z","published":"2024-11-11T15:51:48Z","title":"StoryTeller: Improving Long Video Description through Global\n Audio-Visual Character Identification","summary":" Existing large vision-language models (LVLMs) are largely limited to\nprocessing short, seconds-long videos and struggle with generating coherent\ndescriptions for extended video spanning minutes or more. Long video\ndescription introduces new challenges, such as consistent character\nidentification and plot-level descriptions incorporating both visual and audio\ninformation. To address these, we figure out audio-visual character\nidentification, matching character names to each dialogue, as a key factor. We\npropose StoryTeller, a system for generating dense descriptions of long videos,\nincorporating both low-level visual concepts and high-level plot information.\nStoryTeller uses a multimodal large language model that integrates visual,\naudio, and text modalities to perform audio-visual character identification on\nminute-long video clips. The results are then fed into a LVLM to enhance\nconsistency of video description. We validate our approach on movie description\ntasks and introduce MovieStory101, a dataset with dense descriptions for\nthree-minute movie clips. 
To evaluate long video descriptions, we create\nStoryQA, a large set of multiple-choice questions for MovieStory101 test set.\nWe assess descriptions by inputting them into GPT-4 to answer these questions,\nusing accuracy as an automatic evaluation metric. Experiments show that\nStoryTeller outperforms all open and closed-source baselines on StoryQA,\nachieving 9.5% higher accuracy than the strongest baseline, Gemini-1.5-pro, and\ndemonstrating a +15.56% advantage in human side-by-side evaluations.\nAdditionally, incorporating audio-visual character identification from\nStoryTeller improves the performance of all video description models, with\nGemini-1.5-pro and GPT-4o showing relative improvement of 5.5% and 13.0%,\nrespectively, in accuracy on StoryQA.\n","authors":["Yichen He","Yuan Lin","Jianchao Wu","Hanchong Zhang","Yuchen Zhang","Ruicheng Le"],"pdf_url":"https://arxiv.org/pdf/2411.07076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09263v3","updated":"2025-03-06T09:10:07Z","published":"2024-11-14T08:02:14Z","title":"Rethinking Weight-Averaged Model-merging","summary":" Model-merging has emerged as a powerful approach in deep learning, capable of\nenhancing model performance without any training. However, the underlying\nmechanisms that explain its effectiveness remain largely unexplored. 
In this\npaper, we investigate this technique from three novel perspectives to\nempirically provide deeper insights into why and how weight-averaged\nmodel-merging works: (1) we examine the intrinsic patterns captured by the\nlearning of the model weights, through the visualizations of their patterns on\nseveral datasets, showing that these weights often encode structured and\ninterpretable patterns and that is the essential why model-merging can work;\n(2) we mathematically and empirically investigate model ensemble merging\nstrategies based on averaging on weights versus averaging on features,\nproviding detailed analyses across diverse architectures and datasets; and (3)\nwe explore the impact on model-merging prediction stability in terms of\nchanging the parameter magnitude, revealing insights into the way of weight\naveraging works as regularization by showing the robustness across different\nparameter scales. Our findings shed light on the \"black box\" of weight-averaged\nmodel-merging, offering valuable insights and practical recommendations that\nadvance the model-merging process. The code is available at\nhttps://github.com/billhhh/Rethink-Merge.\n","authors":["Hu Wang","Congbo Ma","Ibrahim Almakky","Ian Reid","Gustavo Carneiro","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2411.09263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04229v1","updated":"2025-03-06T09:09:18Z","published":"2025-03-06T09:09:18Z","title":"Synthetic Data is an Elegant GIFT for Continual Vision-Language Models","summary":" Pre-trained Vision-Language Models (VLMs) require Continual Learning (CL) to\nefficiently update their knowledge and adapt to various downstream tasks\nwithout retraining from scratch. However, for VLMs, in addition to the loss of\nknowledge previously learned from downstream tasks, pre-training knowledge is\nalso corrupted during continual fine-tuning. 
This issue is exacerbated by the\nunavailability of original pre-training data, leaving VLM's generalization\nability degrading. In this paper, we propose GIFT, a novel continual\nfine-tuning approach that utilizes synthetic data to overcome catastrophic\nforgetting in VLMs. Taking advantage of recent advances in text-to-image\nsynthesis, we employ a pre-trained diffusion model to recreate both\npre-training and learned downstream task data. In this way, the VLM can revisit\nprevious knowledge through distillation on matching diffusion-generated images\nand corresponding text prompts. Leveraging the broad distribution and high\nalignment between synthetic image-text pairs in VLM's feature space, we propose\na contrastive distillation loss along with an image-text alignment constraint.\nTo further combat in-distribution overfitting and enhance distillation\nperformance with limited amount of generated data, we incorporate adaptive\nweight consolidation, utilizing Fisher information from these synthetic\nimage-text pairs and achieving a better stability-plasticity balance. Extensive\nexperiments demonstrate that our method consistently outperforms previous\nstate-of-the-art approaches across various settings.\n","authors":["Bin Wu","Wuxuan Shi","Jinqiao Wang","Mang Ye"],"pdf_url":"https://arxiv.org/pdf/2503.04229v1.pdf","comment":"This work is accepted by CVPR 2025. Modifications may be performed"},{"id":"http://arxiv.org/abs/2503.04223v1","updated":"2025-03-06T09:06:06Z","published":"2025-03-06T09:06:06Z","title":"Spiking Meets Attention: Efficient Remote Sensing Image Super-Resolution\n with Attention Spiking Neural Networks","summary":" Spiking neural networks (SNNs) are emerging as a promising alternative to\ntraditional artificial neural networks (ANNs), offering biological plausibility\nand energy efficiency. 
Despite these merits, SNNs are frequently hampered by\nlimited capacity and insufficient representation power, yet remain\nunderexplored in remote sensing super-resolution (SR) tasks. In this paper, we\nfirst observe that spiking signals exhibit drastic intensity variations across\ndiverse textures, highlighting an active learning state of the neurons. This\nobservation motivates us to apply SNNs for efficient SR of RSIs. Inspired by\nthe success of attention mechanisms in representing salient information, we\ndevise the spiking attention block (SAB), a concise yet effective component\nthat optimizes membrane potentials through inferred attention weights, which,\nin turn, regulates spiking activity for superior feature representation. Our\nkey contributions include: 1) we bridge the independent modulation between\ntemporal and channel dimensions, facilitating joint feature correlation\nlearning, and 2) we access the global self-similar patterns in large-scale\nremote sensing imagery to infer spatial attention weights, incorporating\neffective priors for realistic and faithful reconstruction. Building upon SAB,\nwe proposed SpikeSR, which achieves state-of-the-art performance across various\nremote sensing benchmarks such as AID, DOTA, and DIOR, while maintaining high\ncomputational efficiency. The code of SpikeSR will be available upon paper\nacceptance.\n","authors":["Yi Xiao","Qiangqiang Yuan","Kui Jiang","Qiang Zhang","Tingting Zheng","Chia-Wen Lin","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14153v3","updated":"2025-03-06T09:00:18Z","published":"2024-08-26T09:55:34Z","title":"Explaining Caption-Image Interactions in CLIP models with Second-Order\n Attributions","summary":" Dual encoder architectures like CLIP models map two types of inputs into a\nshared embedding space and predict similarities between them. 
Despite their\nsuccess, it is, however, not understood how these models compare their two\ninputs. Common first-order feature-attribution methods can only provide limited\ninsights into dual-encoders since their predictions depend on\nfeature-interactions rather than on individual features. In this paper, we\nfirst derive a second-order method enabling the attribution of predictions by\nany differentiable dual encoder onto feature-interactions between its inputs.\nSecond, we apply our method to CLIP models and show that they learn\nfine-grained correspondences between parts of captions and regions in images.\nThey match objects across input modes also account for mismatches. This\nvisual-linguistic grounding ability, however, varies heavily between object\nclasses and exhibits pronounced out-of-domain effects. We can identify\nindividual errors as well as systematic failure categories including object\ncoverage, unusual scenes and correlated contexts.\n","authors":["Lucas Möller","Pascal Tilli","Ngoc Thang Vu","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2408.14153v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04215v1","updated":"2025-03-06T08:52:29Z","published":"2025-03-06T08:52:29Z","title":"Energy-Guided Optimization for Personalized Image Editing with\n Pretrained Text-to-Image Diffusion Models","summary":" The rapid advancement of pretrained text-driven diffusion models has\nsignificantly enriched applications in image generation and editing. However,\nas the demand for personalized content editing increases, new challenges emerge\nespecially when dealing with arbitrary objects and complex scenes. Existing\nmethods usually mistakes mask as the object shape prior, which struggle to\nachieve a seamless integration result. 
The mostly used inversion noise\ninitialization also hinders the identity consistency towards the target object.\nTo address these challenges, we propose a novel training-free framework that\nformulates personalized content editing as the optimization of edited images in\nthe latent space, using diffusion models as the energy function guidance\nconditioned by reference text-image pairs. A coarse-to-fine strategy is\nproposed that employs text energy guidance at the early stage to achieve a\nnatural transition toward the target class and uses point-to-point\nfeature-level image energy guidance to perform fine-grained appearance\nalignment with the target object. Additionally, we introduce the latent space\ncontent composition to enhance overall identity consistency with the target.\nExtensive experiments demonstrate that our method excels in object replacement\neven with a large domain gap, highlighting its potential for high-quality,\npersonalized image editing.\n","authors":["Rui Jiang","Xinghe Fu","Guangcong Zheng","Teng Li","Taiping Yao","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2503.04215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07155v3","updated":"2025-03-06T08:51:28Z","published":"2024-05-12T04:18:10Z","title":"Meta-Learned Modality-Weighted Knowledge Distillation for Robust\n Multi-Modal Learning with Missing Data","summary":" In multi-modal learning, some modalities are more influential than others,\nand their absence can have a significant impact on classification/segmentation\naccuracy. Addressing this challenge, we propose a novel approach called\nMeta-learned Modality-weighted Knowledge Distillation (MetaKD), which enables\nmulti-modal models to maintain high accuracy even when key modalities are\nmissing. MetaKD adaptively estimates the importance weight of each modality\nthrough a meta-learning process. 
These learned importance weights guide a\npairwise modality-weighted knowledge distillation process, allowing\nhigh-importance modalities to transfer knowledge to lower-importance ones,\nresulting in robust performance despite missing inputs. Unlike previous methods\nin the field, which are often task-specific and require significant\nmodifications, our approach is designed to work in multiple tasks (e.g.,\nsegmentation and classification) with minimal adaptation. Experimental results\non five prevalent datasets, including three Brain Tumor Segmentation datasets\n(BraTS2018, BraTS2019 and BraTS2020), the Alzheimer's Disease Neuroimaging\nInitiative (ADNI) classification dataset and the Audiovision-MNIST\nclassification dataset, demonstrate the proposed model is able to outperform\nthe compared models by a large margin. The code is available at\nhttps://github.com/billhhh/MetaKD.\n","authors":["Hu Wang","Salma Hassan","Yuyuan Liu","Congbo Ma","Yuanhong Chen","Yutong Xie","Mostafa Salem","Yu Tian","Jodie Avery","Louise Hull","Ian Reid","Mohammad Yaqub","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2405.07155v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04207v1","updated":"2025-03-06T08:31:40Z","published":"2025-03-06T08:31:40Z","title":"Bridging the Vision-Brain Gap with an Uncertainty-Aware Blur Prior","summary":" Can our brain signals faithfully reflect the original visual stimuli, even\nincluding high-frequency details? Although human perceptual and cognitive\ncapacities enable us to process and remember visual information, these\nabilities are constrained by several factors, such as limited attentional\nresources and the finite capacity of visual memory. 
When visual stimuli are\nprocessed by human visual system into brain signals, some information is\ninevitably lost, leading to a discrepancy known as the \\textbf{System GAP}.\nAdditionally, perceptual and cognitive dynamics, along with technical noise in\nsignal acquisition, degrade the fidelity of brain signals relative to the\nvisual stimuli, known as the \\textbf{Random GAP}. When encoded brain\nrepresentations are directly aligned with the corresponding pretrained image\nfeatures, the System GAP and Random GAP between paired data challenge the\nmodel, requiring it to bridge these gaps. However, in the context of limited\npaired data, these gaps are difficult for the model to learn, leading to\noverfitting and poor generalization to new data. To address these GAPs, we\npropose a simple yet effective approach called the \\textbf{Uncertainty-aware\nBlur Prior (UBP)}. It estimates the uncertainty within the paired data,\nreflecting the mismatch between brain signals and visual stimuli. Based on this\nuncertainty, UBP dynamically blurs the high-frequency details of the original\nimages, reducing the impact of the mismatch and improving alignment. Our method\nachieves a top-1 accuracy of \\textbf{50.9\\%} and a top-5 accuracy of\n\\textbf{79.7\\%} on the zero-shot brain-to-image retrieval task, surpassing\nprevious state-of-the-art methods by margins of \\textbf{13.7\\%} and\n\\textbf{9.8\\%}, respectively. 
Code is available at\n\\href{https://github.com/HaitaoWuTJU/Uncertainty-aware-Blur-Prior}{GitHub}.\n","authors":["Haitao Wu","Qing Li","Changqing Zhang","Zhen He","Xiaomin Ying"],"pdf_url":"https://arxiv.org/pdf/2503.04207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04205v1","updated":"2025-03-06T08:30:33Z","published":"2025-03-06T08:30:33Z","title":"Learning 3D Medical Image Models From Brain Functional Connectivity\n Network Supervision For Mental Disorder Diagnosis","summary":" In MRI-based mental disorder diagnosis, most previous studies focus on\nfunctional connectivity network (FCN) derived from functional MRI (fMRI).\nHowever, the small size of annotated fMRI datasets restricts its wide\napplication. Meanwhile, structural MRIs (sMRIs), such as 3D T1-weighted (T1w)\nMRI, which are commonly used and readily accessible in clinical settings, are\noften overlooked. To integrate the complementary information from both function\nand structure for improved diagnostic accuracy, we propose CINP (Contrastive\nImage-Network Pre-training), a framework that employs contrastive learning\nbetween sMRI and FCN. During pre-training, we incorporate masked image modeling\nand network-image matching to enhance visual representation learning and\nmodality alignment. Since the CINP facilitates knowledge transfer from FCN to\nsMRI, we introduce network prompting. It utilizes only sMRI from suspected\npatients and a small amount of FCNs from different patient classes for\ndiagnosing mental disorders, which is practical in real-world clinical\nscenario. 
The competitive performance on three mental disorder diagnosis tasks\ndemonstrate the effectiveness of the CINP in integrating multimodal MRI\ninformation, as well as the potential of incorporating sMRI into clinical\ndiagnosis using network prompting.\n","authors":["Xingcan Hu","Wei Wang","Li Xiao"],"pdf_url":"https://arxiv.org/pdf/2503.04205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04204v1","updated":"2025-03-06T08:30:18Z","published":"2025-03-06T08:30:18Z","title":"FUSE: First-Order and Second-Order Unified SynthEsis in Stochastic\n Optimization","summary":" Stochastic optimization methods have actively been playing a critical role in\nmodern machine learning algorithms to deliver decent performance. While\nnumerous works have proposed and developed diverse approaches, first-order and\nsecond-order methods are in entirely different situations. The former is\nsignificantly pivotal and dominating in emerging deep learning but only leads\nconvergence to a stationary point. However, second-order methods are less\npopular due to their computational intensity in large-dimensional problems.\nThis paper presents a novel method that leverages both the first-order and\nsecond-order methods in a unified algorithmic framework, termed FUSE, from\nwhich a practical version (PV) is derived accordingly. FUSE-PV stands as a\nsimple yet efficient optimization method involving a switch-over between first\nand second orders. Additionally, we develop different criteria that determine\nwhen to switch. FUSE-PV has provably shown a smaller computational complexity\nthan SGD and Adam. To validate our proposed scheme, we present an ablation\nstudy on several simple test functions and show a comparison with baselines for\nbenchmark datasets.\n","authors":["Zhanhong Jiang","Md Zahid Hasan","Aditya Balu","Joshua R. 
Waite","Genyi Huang","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2503.04204v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.13056v2","updated":"2025-03-06T08:28:09Z","published":"2024-11-20T06:08:21Z","title":"Efficient Masked AutoEncoder for Video Object Counting and A Large-Scale\n Benchmark","summary":" The dynamic imbalance of the fore-background is a major challenge in video\nobject counting, which is usually caused by the sparsity of target objects.\nThis remains understudied in existing works and often leads to severe\nunder-/over-prediction errors. To tackle this issue in video object counting,\nwe propose a density-embedded Efficient Masked Autoencoder Counting (E-MAC)\nframework in this paper. To empower the model's representation ability on\ndensity regression, we develop a new $\\mathtt{D}$ensity-$\\mathtt{E}$mbedded\n$\\mathtt{M}$asked m$\\mathtt{O}$deling ($\\mathtt{DEMO}$) method, which first\ntakes the density map as an auxiliary modality to perform multimodal\nself-representation learning for image and density map. Although\n$\\mathtt{DEMO}$ contributes to effective cross-modal regression guidance, it\nalso brings in redundant background information, making it difficult to focus\non the foreground regions. To handle this dilemma, we propose an efficient\nspatial adaptive masking derived from density maps to boost efficiency.\nMeanwhile, we employ an optical flow-based temporal collaborative fusion\nstrategy to effectively capture the dynamic variations across frames, aligning\nfeatures to derive multi-frame density residuals. The counting accuracy of the\ncurrent frame is boosted by harnessing the information from adjacent frames. In\naddition, considering that most existing datasets are limited to human-centric\nscenarios, we first propose a large video bird counting dataset, DroneBird, in\nnatural scenarios for migratory bird protection. 
Extensive experiments on three\ncrowd datasets and our \\textit{DroneBird} validate our superiority against the\ncounterparts. The code and dataset are available.\n","authors":["Bing Cao","Quanhao Lu","Jiekang Feng","Qilong Wang","Qinghua Hu","Pengfei Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.13056v2.pdf","comment":"ICLR25"},{"id":"http://arxiv.org/abs/2503.04199v1","updated":"2025-03-06T08:27:51Z","published":"2025-03-06T08:27:51Z","title":"MASTER: Multimodal Segmentation with Text Prompts","summary":" RGB-Thermal fusion is a potential solution for various weather and light\nconditions in challenging scenarios. However, plenty of studies focus on\ndesigning complex modules to fuse different modalities. With the widespread\napplication of large language models (LLMs), valuable information can be more\neffectively extracted from natural language. Therefore, we aim to leverage the\nadvantages of large language models to design a structurally simple and highly\nadaptable multimodal fusion model architecture. We proposed MultimodAl\nSegmentation with TExt PRompts (MASTER) architecture, which integrates LLM into\nthe fusion of RGB-Thermal multimodal data and allows complex query text to\nparticipate in the fusion process. Our model utilizes a dual-path structure to\nextract information from different modalities of images. Additionally, we\nemploy LLM as the core module for multimodal fusion, enabling the model to\ngenerate learnable codebook tokens from RGB, thermal images, and textual\ninformation. A lightweight image decoder is used to obtain semantic\nsegmentation results. 
The proposed MASTER performs exceptionally well in\nbenchmark tests across various automated driving scenarios, yielding promising\nresults.\n","authors":["Fuyang Liu","Shun Lu","Jilin Mei","Yu Hu"],"pdf_url":"https://arxiv.org/pdf/2503.04199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04191v1","updated":"2025-03-06T08:06:03Z","published":"2025-03-06T08:06:03Z","title":"Conformal forecasting for surgical instrument trajectory","summary":" Forecasting surgical instrument trajectories and predicting the next surgical\naction recently started to attract attention from the research community. Both\nthese tasks are crucial for automation and assistance in endoscopy surgery.\nGiven the safety-critical nature of these tasks, reliable uncertainty\nquantification is essential. Conformal prediction is a fast-growing and widely\nrecognized framework for uncertainty estimation in machine learning and\ncomputer vision, offering distribution-free, theoretically valid prediction\nintervals. In this work, we explore the application of standard conformal\nprediction and conformalized quantile regression to estimate uncertainty in\nforecasting surgical instrument motion, i.e., predicting direction and\nmagnitude of surgical instruments' future motion. We analyze and compare their\ncoverage and interval sizes, assessing the impact of multiple hypothesis\ntesting and correction methods. Additionally, we show how these techniques can\nbe employed to produce useful uncertainty heatmaps. 
To the best of our\nknowledge, this is the first study applying conformal prediction to surgical\nguidance, marking an initial step toward constructing principled prediction\nintervals with formal coverage guarantees in this domain.\n","authors":["Sara Sangalli","Gary Sarwin","Ertunc Erdil","Carlo Serra","Ender Konukoglu"],"pdf_url":"https://arxiv.org/pdf/2503.04191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04171v1","updated":"2025-03-06T07:36:45Z","published":"2025-03-06T07:36:45Z","title":"DuCos: Duality Constrained Depth Super-Resolution via Foundation Model","summary":" We introduce DuCos, a novel depth super-resolution framework grounded in\nLagrangian duality theory, offering a flexible integration of multiple\nconstraints and reconstruction objectives to enhance accuracy and robustness.\nOur DuCos is the first to significantly improve generalization across diverse\nscenarios with foundation models as prompts. The prompt design consists of two\nkey components: Correlative Fusion (CF) and Gradient Regulation (GR). CF\nfacilitates precise geometric alignment and effective fusion between prompt and\ndepth features, while GR refines depth predictions by enforcing consistency\nwith sharp-edged depth maps derived from foundation models. Crucially, these\nprompts are seamlessly embedded into the Lagrangian constraint term, forming a\nsynergistic and principled framework. Extensive experiments demonstrate that\nDuCos outperforms existing state-of-the-art methods, achieving superior\naccuracy, robustness, and generalization. 
The source codes and pre-trained\nmodels will be publicly available.\n","authors":["Zhiqiang Yan","Zhengxue Wang","Haoye Dong","Jun Li","Jian Yang","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2503.04171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04167v1","updated":"2025-03-06T07:29:33Z","published":"2025-03-06T07:29:33Z","title":"The Role of Visual Modality in Multimodal Mathematical Reasoning:\n Challenges and Insights","summary":" Recent research has increasingly focused on multimodal mathematical\nreasoning, particularly emphasizing the creation of relevant datasets and\nbenchmarks. Despite this, the role of visual information in reasoning has been\nunderexplored. Our findings show that existing multimodal mathematical models\nminimally leverage visual information, and model performance remains largely\nunaffected by changes to or removal of images in the dataset. We attribute this\nto the dominance of textual information and answer options that inadvertently\nguide the model to correct answers. To improve evaluation methods, we introduce\nthe HC-M3D dataset, specifically designed to require image reliance for\nproblem-solving and to challenge models with similar, yet distinct, images that\nchange the correct answer. In testing leading models, their failure to detect\nthese subtle visual differences suggests limitations in current visual\nperception capabilities. Additionally, we observe that the common approach of\nimproving general VQA capabilities by combining various types of image encoders\ndoes not contribute to math reasoning performance. This finding also presents a\nchallenge to enhancing visual reliance during math reasoning. 
Our benchmark and\ncode would be available at\n\\href{https://github.com/Yufang-Liu/visual_modality_role}{https://github.com/Yufang-Liu/visual\\_modality\\_role}.\n","authors":["Yufang Liu","Yao Du","Tao Ji","Jianing Wang","Yang Liu","Yuanbin Wu","Aimin Zhou","Mengdi Zhang","Xunliang Cai"],"pdf_url":"https://arxiv.org/pdf/2503.04167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11505v4","updated":"2025-03-06T07:26:35Z","published":"2024-11-18T12:05:27Z","title":"LaVin-DiT: Large Vision Diffusion Transformer","summary":" This paper presents the Large Vision Diffusion Transformer (LaVin-DiT), a\nscalable and unified foundation model designed to tackle over 20 computer\nvision tasks in a generative framework. Unlike existing large vision models\ndirectly adapted from natural language processing architectures, which rely on\nless efficient autoregressive techniques and disrupt spatial relationships\nessential for vision data, LaVin-DiT introduces key innovations to optimize\ngenerative performance for vision tasks. First, to address the high\ndimensionality of visual data, we incorporate a spatial-temporal variational\nautoencoder that encodes data into a continuous latent space. Second, for\ngenerative modeling, we develop a joint diffusion transformer that\nprogressively produces vision outputs. Third, for unified multi-task training,\nin-context learning is implemented. Input-target pairs serve as task context,\nwhich guides the diffusion transformer to align outputs with specific tasks\nwithin the latent space. During inference, a task-specific context set and test\ndata as queries allow LaVin-DiT to generalize across tasks without fine-tuning.\nTrained on extensive vision datasets, the model is scaled from 0.1B to 3.4B\nparameters, demonstrating substantial scalability and state-of-the-art\nperformance across diverse vision tasks. 
This work introduces a novel pathway\nfor large vision foundation models, underscoring the promising potential of\ndiffusion transformers. The code and models are available.\n","authors":["Zhaoqing Wang","Xiaobo Xia","Runnan Chen","Dongdong Yu","Changhu Wang","Mingming Gong","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11505v4.pdf","comment":"37 pages, 30 figures, 4 tables. Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04165v1","updated":"2025-03-06T07:25:43Z","published":"2025-03-06T07:25:43Z","title":"WeakSupCon: Weakly Supervised Contrastive Learning for Encoder\n Pre-training","summary":" Weakly supervised multiple instance learning (MIL) is a challenging task\ngiven that only bag-level labels are provided, while each bag typically\ncontains multiple instances. This topic has been extensively studied in\nhistopathological image analysis, where labels are usually available only at\nthe whole slide image (WSI) level, while each whole slide image can be divided\ninto thousands of small image patches for training. The dominant MIL approaches\ntake fixed patch features as inputs to address computational constraints and\nensure model stability. These features are commonly generated by encoders\npre-trained on ImageNet, foundation encoders pre-trained on large datasets, or\nthrough self-supervised learning on local datasets. While the self-supervised\nencoder pre-training on the same dataset as downstream MIL tasks helps mitigate\ndomain shift and generate better features, the bag-level labels are not\nutilized during the process, and the features of patches from different\ncategories may cluster together, reducing classification performance on MIL\ntasks. Recently, pre-training with supervised contrastive learning (SupCon) has\ndemonstrated superior performance compared to self-supervised contrastive\nlearning and even end-to-end training on traditional image classification\ntasks. 
In this paper, we propose a novel encoder pre-training method for\ndownstream MIL tasks called Weakly Supervised Contrastive Learning (WeakSupCon)\nthat utilizes bag-level labels. In our method, we employ multi-task learning\nand define distinct contrastive learning losses for samples with different bag\nlabels. Our experiments demonstrate that the features generated using\nWeakSupCon significantly enhance MIL classification performance compared to\nself-supervised approaches across three datasets.\n","authors":["Bodong Zhang","Hamid Manoochehri","Beatrice S. Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2503.04165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01234v2","updated":"2025-03-06T07:11:32Z","published":"2025-03-03T06:57:54Z","title":"Self-Adaptive Gamma Context-Aware SSM-based Model for Metal Defect\n Detection","summary":" Metal defect detection is critical in industrial quality assurance, yet\nexisting methods struggle with grayscale variations and complex defect states,\nlimiting its robustness. To address these challenges, this paper proposes a\nSelf-Adaptive Gamma Context-Aware SSM-based model(GCM-DET). This advanced\ndetection framework integrating a Dynamic Gamma Correction (GC) module to\nenhance grayscale representation and optimize feature extraction for precise\ndefect reconstruction. A State-Space Search Management (SSM) architecture\ncaptures robust multi-scale features, effectively handling defects of varying\nshapes and scales. Focal Loss is employed to mitigate class imbalance and\nrefine detection accuracy. Additionally, the CD5-DET dataset is introduced,\nspecifically designed for port container maintenance, featuring significant\ngrayscale variations and intricate defect patterns. 
Experimental results\ndemonstrate that the proposed model achieves substantial improvements, with\nmAP@0.5 gains of 27.6\\%, 6.6\\%, and 2.6\\% on the CD5-DET, NEU-DET, and GC10-DET\ndatasets.\n","authors":["Sijin Sun","Ming Deng","Xingrui Yu","Xinyu Xi","Liangbin Zhao"],"pdf_url":"https://arxiv.org/pdf/2503.01234v2.pdf","comment":"19 pages, 9 figures, under review"},{"id":"http://arxiv.org/abs/2410.14595v2","updated":"2025-03-06T07:06:50Z","published":"2024-10-18T16:48:31Z","title":"DRACO-DehazeNet: An Efficient Image Dehazing Network Combining Detail\n Recovery and a Novel Contrastive Learning Paradigm","summary":" Image dehazing is crucial for clarifying images obscured by haze or fog, but\ncurrent learning-based approaches is dependent on large volumes of training\ndata and hence consumed significant computational power. Additionally, their\nperformance is often inadequate under non-uniform or heavy haze. To address\nthese challenges, we developed the Detail Recovery And Contrastive DehazeNet,\nwhich facilitates efficient and effective dehazing via a dense dilated inverted\nresidual block and an attention-based detail recovery network that tailors\nenhancements to specific dehazed scene contexts. A major innovation is its\nability to train effectively with limited data, achieved through a novel\nquadruplet loss-based contrastive dehazing paradigm. This approach distinctly\nseparates hazy and clear image features while also distinguish lower-quality\nand higher-quality dehazed images obtained from each sub-modules of our\nnetwork, thereby refining the dehazing process to a larger extent. Extensive\ntests on a variety of benchmarked haze datasets demonstrated the superiority of\nour approach. 
The code repository for this work is available at\nhttps://github.com/GreedYLearner1146/DRACO-DehazeNet.\n","authors":["Gao Yu Lee","Tanmoy Dam","Md Meftahul Ferdaus","Daniel Puiu Poenar","Vu Duong"],"pdf_url":"https://arxiv.org/pdf/2410.14595v2.pdf","comment":"Once the paper is accepted and published, the copyright will be\n transferred to the corresponding journal"},{"id":"http://arxiv.org/abs/2503.04154v1","updated":"2025-03-06T07:02:13Z","published":"2025-03-06T07:02:13Z","title":"CA-W3D: Leveraging Context-Aware Knowledge for Weakly Supervised\n Monocular 3D Detection","summary":" Weakly supervised monocular 3D detection, while less annotation-intensive,\noften struggles to capture the global context required for reliable 3D\nreasoning. Conventional label-efficient methods focus on object-centric\nfeatures, neglecting contextual semantic relationships that are critical in\ncomplex scenes. In this work, we propose a Context-Aware Weak Supervision for\nMonocular 3D object detection, namely CA-W3D, to address this limitation in a\ntwo-stage training paradigm. Specifically, we first introduce a pre-training\nstage employing Region-wise Object Contrastive Matching (ROCM), which aligns\nregional object embeddings derived from a trainable monocular 3D encoder and a\nfrozen open-vocabulary 2D visual grounding model. This alignment encourages the\nmonocular encoder to discriminate scene-specific attributes and acquire richer\ncontextual knowledge. In the second stage, we incorporate a pseudo-label\ntraining process with a Dual-to-One Distillation (D2OD) mechanism, which\neffectively transfers contextual priors into the monocular encoder while\npreserving spatial fidelity and maintaining computational efficiency during\ninference. 
Extensive experiments conducted on the public KITTI benchmark\ndemonstrate the effectiveness of our approach, surpassing the SoTA method over\nall metrics, highlighting the importance of contextual-aware knowledge in\nweakly-supervised monocular 3D detection.\n","authors":["Chupeng Liu","Runkai Zhao","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2503.04154v1.pdf","comment":"The paper includes 8 pages, 6 figures and 4 tables"},{"id":"http://arxiv.org/abs/2503.04151v1","updated":"2025-03-06T07:01:08Z","published":"2025-03-06T07:01:08Z","title":"Robust Multi-View Learning via Representation Fusion of Sample-Level\n Attention and Alignment of Simulated Perturbation","summary":" Recently, multi-view learning (MVL) has garnered significant attention due to\nits ability to fuse discriminative information from multiple views. However,\nreal-world multi-view datasets are often heterogeneous and imperfect, which\nusually makes MVL methods designed for specific combinations of views lack\napplication potential and limits their effectiveness. To address this issue, we\npropose a novel robust MVL method (namely RML) with simultaneous representation\nfusion and alignment. Specifically, we introduce a simple yet effective\nmulti-view transformer fusion network where we transform heterogeneous\nmulti-view data into homogeneous word embeddings, and then integrate multiple\nviews by the sample-level attention mechanism to obtain a fused representation.\nFurthermore, we propose a simulated perturbation based multi-view contrastive\nlearning framework that dynamically generates the noise and unusable\nperturbations for simulating imperfect data conditions. The simulated noisy and\nunusable data obtain two distinct fused representations, and we utilize\ncontrastive learning to align them for learning discriminative and robust\nrepresentations. Our RML is self-supervised and can also be applied for\ndownstream tasks as a regularization. 
In experiments, we employ it in\nunsupervised multi-view clustering, noise-label classification, and as a\nplug-and-play module for cross-modal hashing retrieval. Extensive comparison\nexperiments and ablation studies validate the effectiveness of RML.\n","authors":["Jie Xu","Na Zhao","Gang Niu","Masashi Sugiyama","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2503.04151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04014v3","updated":"2025-03-06T06:55:15Z","published":"2023-07-08T16:46:16Z","title":"Novel Pipeline for Diagnosing Acute Lymphoblastic Leukemia Sensitive to\n Related Biomarkers","summary":" Acute Lymphoblastic Leukemia (ALL) is one of the most common types of\nchildhood blood cancer. The quick start of the treatment process is critical to\nsaving the patient's life, and for this reason, early diagnosis of this disease\nis essential. Examining the blood smear images of these patients is one of the\nmethods used by expert doctors to diagnose this disease. Deep learning-based\nmethods have numerous applications in medical fields, as they have\nsignificantly advanced in recent years. ALL diagnosis is not an exception in\nthis field, and several machine learning-based methods for this problem have\nbeen proposed. In previous methods, high diagnostic accuracy was reported, but\nour work showed that this alone is not sufficient, as it can lead to models\ntaking shortcuts and not making meaningful decisions. This issue arises due to\nthe small size of medical training datasets. To address this, we constrained\nour model to follow a pipeline inspired by experts' work. We also demonstrated\nthat, since a judgement based on only one image is insufficient, redefining the\nproblem as a multiple-instance learning problem is necessary for achieving a\npractical result. Our model is the first to provide a solution to this problem\nin a multiple-instance learning setup. 
We introduced a novel pipeline for\ndiagnosing ALL that approximates the process used by hematologists, is\nsensitive to disease biomarkers, and achieves an accuracy of 96.15%, an\nF1-score of 94.24%, a sensitivity of 97.56%, and a specificity of 90.91% on ALL\nIDB 1. Our method was further evaluated on an out-of-distribution dataset,\nwhich posed a challenging test and had acceptable performance. Notably, our\nmodel was trained on a relatively small dataset, highlighting the potential for\nour approach to be applied to other medical datasets with limited data\navailability.\n","authors":["Amirhossein Askari Farsangi","Ali Sharifi-Zarchi","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2307.04014v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04144v1","updated":"2025-03-06T06:41:38Z","published":"2025-03-06T06:41:38Z","title":"DM-Adapter: Domain-Aware Mixture-of-Adapters for Text-Based Person\n Retrieval","summary":" Text-based person retrieval (TPR) has gained significant attention as a\nfine-grained and challenging task that closely aligns with practical\napplications. Tailoring CLIP to person domain is now a emerging research topic\ndue to the abundant knowledge of vision-language pretraining, but challenges\nstill remain during fine-tuning: (i) Previous full-model fine-tuning in TPR is\ncomputationally expensive and prone to overfitting.(ii) Existing\nparameter-efficient transfer learning (PETL) for TPR lacks of fine-grained\nfeature extraction. To address these issues, we propose Domain-Aware\nMixture-of-Adapters (DM-Adapter), which unifies Mixture-of-Experts (MOE) and\nPETL to enhance fine-grained feature representations while maintaining\nefficiency. Specifically, Sparse Mixture-of-Adapters is designed in parallel to\nMLP layers in both vision and language branches, where different experts\nspecialize in distinct aspects of person knowledge to handle features more\nfinely. 
To promote the router to exploit domain information effectively and\nalleviate the routing imbalance, Domain-Aware Router is then developed by\nbuilding a novel gating function and injecting learnable domain-aware prompts.\nExtensive experiments show that our DM-Adapter achieves state-of-the-art\nperformance, outperforming previous methods by a significant margin.\n","authors":["Yating Liu","Zimo Liu","Xiangyuan Lan","Wenming Yang","Yaowei Li","Qingmin Liao"],"pdf_url":"https://arxiv.org/pdf/2503.04144v1.pdf","comment":"9 pages, 5 figures, accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2503.04139v1","updated":"2025-03-06T06:35:19Z","published":"2025-03-06T06:35:19Z","title":"Robust Computer-Vision based Construction Site Detection for\n Assistive-Technology Applications","summary":" Navigating urban environments poses significant challenges for people with\ndisabilities, particularly those with blindness and low vision. Environments\nwith dynamic and unpredictable elements like construction sites are especially\nchallenging. Construction sites introduce hazards like uneven surfaces,\nobstructive barriers, hazardous materials, and excessive noise, and they can\nalter routing, complicating safe mobility. Existing assistive technologies are\nlimited, as navigation apps do not account for construction sites during trip\nplanning, and detection tools that attempt hazard recognition struggle to\naddress the extreme variability of construction paraphernalia. This study\nintroduces a novel computer vision-based system that integrates open-vocabulary\nobject detection, a YOLO-based scaffolding-pole detection model, and an optical\ncharacter recognition (OCR) module to comprehensively identify and interpret\nconstruction site elements for assistive navigation. In static testing across\nseven construction sites, the system achieved an overall accuracy of 88.56\\%,\nreliably detecting objects from 2m to 10m within a 0$^\\circ$ -- 75$^\\circ$\nangular offset. 
At closer distances (2--4m), the detection rate was 100\\% at\nall tested angles. At\n","authors":["Junchi Feng","Giles Hamilton-Fletcher","Nikhil Ballem","Michael Batavia","Yifei Wang","Jiuling Zhong","Maurizio Porfiri","John-Ross Rizzo"],"pdf_url":"https://arxiv.org/pdf/2503.04139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04134v1","updated":"2025-03-06T06:26:57Z","published":"2025-03-06T06:26:57Z","title":"Real-time Spatial-temporal Traversability Assessment via Feature-based\n Sparse Gaussian Process","summary":" Terrain analysis is critical for the practical application of ground mobile\nrobots in real-world tasks, especially in outdoor unstructured environments. In\nthis paper, we propose a novel spatial-temporal traversability assessment\nmethod, which aims to enable autonomous robots to effectively navigate through\ncomplex terrains. Our approach utilizes sparse Gaussian processes (SGP) to\nextract geometric features (curvature, gradient, elevation, etc.) directly from\npoint cloud scans. These features are then used to construct a high-resolution\nlocal traversability map. Then, we design a spatial-temporal Bayesian Gaussian\nkernel (BGK) inference method to dynamically evaluate traversability scores,\nintegrating historical and real-time data while considering factors such as\nslope, flatness, gradient, and uncertainty metrics. GPU acceleration is applied\nin the feature extraction step, and the system achieves real-time performance.\nExtensive simulation experiments across diverse terrain scenarios demonstrate\nthat our method outperforms SOTA approaches in both accuracy and computational\nefficiency. Additionally, we develop an autonomous navigation framework\nintegrated with the traversability map and validate it with a differential\ndriven vehicle in complex outdoor environments. 
Our code will be open-source\nfor further research and development by the community,\nhttps://github.com/ZJU-FAST-Lab/FSGP_BGK.\n","authors":["Senming Tan","Zhenyu Hou","Zhihao Zhang","Long Xu","Mengke Zhang","Zhaoqi He","Chao Xu","Fei Gao","Yanjun Cao"],"pdf_url":"https://arxiv.org/pdf/2503.04134v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2503.04131v1","updated":"2025-03-06T06:24:51Z","published":"2025-03-06T06:24:51Z","title":"Q-PART: Quasi-Periodic Adaptive Regression with Test-time Training for\n Pediatric Left Ventricular Ejection Fraction Regression","summary":" In this work, we address the challenge of adaptive pediatric Left Ventricular\nEjection Fraction (LVEF) assessment. While Test-time Training (TTT) approaches\nshow promise for this task, they suffer from two significant limitations.\nExisting TTT works are primarily designed for classification tasks rather than\ncontinuous value regression, and they lack mechanisms to handle the\nquasi-periodic nature of cardiac signals. To tackle these issues, we propose a\nnovel \\textbf{Q}uasi-\\textbf{P}eriodic \\textbf{A}daptive \\textbf{R}egression\nwith \\textbf{T}est-time Training (Q-PART) framework. In the training stage, the\nproposed Quasi-Period Network decomposes the echocardiogram into periodic and\naperiodic components within latent space by combining parameterized helix\ntrajectories with Neural Controlled Differential Equations. During inference,\nour framework further employs a variance minimization strategy across image\naugmentations that simulate common quality issues in echocardiogram\nacquisition, along with differential adaptation rates for periodic and\naperiodic components. Theoretical analysis is provided to demonstrate that our\nvariance minimization objective effectively bounds the regression error under\nmild conditions. 
Furthermore, extensive experiments across three pediatric age\ngroups demonstrate that Q-PART not only significantly outperforms existing\napproaches in pediatric LVEF prediction, but also exhibits strong clinical\nscreening capability with high mAUROC scores (up to 0.9747) and maintains\ngender-fair performance across all metrics, validating its robustness and\npractical utility in pediatric echocardiography analysis.\n","authors":["Jie Liu","Tiexin Qin","Hui Liu","Yilei Shi","Lichao Mou","Xiao Xiang Zhu","Shiqi Wang","Haoliang Li"],"pdf_url":"https://arxiv.org/pdf/2503.04131v1.pdf","comment":"Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04130v1","updated":"2025-03-06T06:17:38Z","published":"2025-03-06T06:17:38Z","title":"Token-Efficient Long Video Understanding for Multimodal LLMs","summary":" Recent advances in video-based multimodal large language models (Video-LLMs)\nhave significantly improved video understanding by processing videos as\nsequences of image frames. However, many existing methods treat frames\nindependently in the vision backbone, lacking explicit temporal modeling, which\nlimits their ability to capture dynamic patterns and efficiently handle long\nvideos. To address these limitations, we introduce STORM\n(\\textbf{S}patiotemporal \\textbf{TO}ken \\textbf{R}eduction for\n\\textbf{M}ultimodal LLMs), a novel architecture incorporating a dedicated\ntemporal encoder between the image encoder and the LLM. Our temporal encoder\nleverages the Mamba State Space Model to integrate temporal information into\nimage tokens, generating enriched representations that preserve inter-frame\ndynamics across the entire video sequence. This enriched encoding not only\nenhances video reasoning capabilities but also enables effective token\nreduction strategies, including test-time sampling and training-based temporal\nand spatial pooling, substantially reducing computational demands on the LLM\nwithout sacrificing key temporal information. 
By integrating these techniques,\nour approach simultaneously reduces training and inference latency while\nimproving performance, enabling efficient and robust video understanding over\nextended temporal contexts. Extensive evaluations show that STORM achieves\nstate-of-the-art results across various long video understanding benchmarks\n(more than 5\\% improvement on MLVU and LongVideoBench) while reducing the\ncomputation costs by up to $8\\times$ and the decoding latency by\n2.4-2.9$\\times$ for the fixed numbers of input frames. Project page is\navailable at https://research.nvidia.com/labs/lpr/storm\n","authors":["Jindong Jiang","Xiuyu Li","Zhijian Liu","Muyang Li","Guo Chen","Zhiqi Li","De-An Huang","Guilin Liu","Zhiding Yu","Kurt Keutzer","Sungjin Ahn","Jan Kautz","Hongxu Yin","Yao Lu","Song Han","Wonmin Byeon"],"pdf_url":"https://arxiv.org/pdf/2503.04130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04127v1","updated":"2025-03-06T06:13:27Z","published":"2025-03-06T06:13:27Z","title":"Diff-Reg v2: Diffusion-Based Matching Matrix Estimation for Image\n Matching and 3D Registration","summary":" Establishing reliable correspondences is crucial for all registration tasks,\nincluding 2D image registration, 3D point cloud registration, and 2D-3D\nimage-to-point cloud registration. However, these tasks are often complicated\nby challenges such as scale inconsistencies, symmetry, and large deformations,\nwhich can lead to ambiguous matches. Previous feature-based and\ncorrespondence-based methods typically rely on geometric or semantic features\nto generate or polish initial potential correspondences. Some methods typically\nleverage specific geometric priors, such as topological preservation, to devise\ndiverse and innovative strategies tailored to a given enhancement goal, which\ncannot be exhaustively enumerated. 
Additionally, many previous approaches rely\non a single-step prediction head, which can struggle with local minima in\ncomplex matching scenarios. To address these challenges, we introduce an\ninnovative paradigm that leverages a diffusion model in matrix space for robust\nmatching matrix estimation. Our model treats correspondence estimation as a\ndenoising diffusion process in the matching matrix space, gradually refining\nthe intermediate matching matrix to the optimal one. Specifically, we apply the\ndiffusion model in the doubly stochastic matrix space for 3D-3D and 2D-3D\nregistration tasks. In the 2D image registration task, we deploy the diffusion\nmodel in a matrix subspace where dual-softmax projection regularization is\napplied. For all three registration tasks, we provide adaptive matching matrix\nembedding implementations tailored to the specific characteristics of each task\nwhile maintaining a consistent \"match-to-warp\" encoding pattern. Furthermore,\nwe adopt a lightweight design for the denoising module. In inference, once\npoints or image features are extracted and fixed, this module performs\nmulti-step denoising predictions through reverse sampling.\n","authors":["Qianliang Wu","Haobo Jiang","Yaqing Ding","Lei Luo","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2503.04127v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.19919"},{"id":"http://arxiv.org/abs/2503.04126v1","updated":"2025-03-06T06:10:21Z","published":"2025-03-06T06:10:21Z","title":"DVM-SLAM: Decentralized Visual Monocular Simultaneous Localization and\n Mapping for Multi-Agent Systems","summary":" Cooperative Simultaneous Localization and Mapping (C-SLAM) enables multiple\nagents to work together in mapping unknown environments while simultaneously\nestimating their own positions. This approach enhances robustness, scalability,\nand accuracy by sharing information between agents, reducing drift, and\nenabling collective exploration of larger areas. 
In this paper, we present\nDecentralized Visual Monocular SLAM (DVM-SLAM), the first open-source\ndecentralized monocular C-SLAM system. By only utilizing low-cost and\nlight-weight monocular vision sensors, our system is well suited for small\nrobots and micro aerial vehicles (MAVs). DVM-SLAM's real-world applicability is\nvalidated on physical robots with a custom collision avoidance framework,\nshowcasing its potential in real-time multi-agent autonomous navigation\nscenarios. We also demonstrate comparable accuracy to state-of-the-art\ncentralized monocular C-SLAM systems. We open-source our code and provide\nsupplementary material online.\n","authors":["Joshua Bird","Jan Blumenkamp","Amanda Prorok"],"pdf_url":"https://arxiv.org/pdf/2503.04126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04123v1","updated":"2025-03-06T06:00:55Z","published":"2025-03-06T06:00:55Z","title":"GAGrasp: Geometric Algebra Diffusion for Dexterous Grasping","summary":" We propose GAGrasp, a novel framework for dexterous grasp generation that\nleverages geometric algebra representations to enforce equivariance to SE(3)\ntransformations. By encoding the SE(3) symmetry constraint directly into the\narchitecture, our method improves data and parameter efficiency while enabling\nrobust grasp generation across diverse object poses. Additionally, we\nincorporate a differentiable physics-informed refinement layer, which ensures\nthat generated grasps are physically plausible and stable. Extensive\nexperiments demonstrate the model's superior performance in generalization,\nstability, and adaptability compared to existing methods. 
Additional details at\nhttps://gagrasp.github.io/\n","authors":["Tao Zhong","Christine Allen-Blanchette"],"pdf_url":"https://arxiv.org/pdf/2503.04123v1.pdf","comment":"Accepted at ICRA 2025"},{"id":"http://arxiv.org/abs/2503.00675v2","updated":"2025-03-06T05:59:08Z","published":"2025-03-02T00:40:50Z","title":"Dur360BEV: A Real-world 360-degree Single Camera Dataset and Benchmark\n for Bird-Eye View Mapping in Autonomous Driving","summary":" We present Dur360BEV, a novel spherical camera autonomous driving dataset\nequipped with a high-resolution 128-channel 3D LiDAR and a RTK-refined GNSS/INS\nsystem, along with a benchmark architecture designed to generate Bird-Eye-View\n(BEV) maps using only a single spherical camera. This dataset and benchmark\naddress the challenges of BEV generation in autonomous driving, particularly by\nreducing hardware complexity through the use of a single 360-degree camera\ninstead of multiple perspective cameras. Within our benchmark architecture, we\npropose a novel spherical-image-to-BEV module that leverages spherical imagery\nand a refined sampling strategy to project features from 2D to 3D. Our approach\nalso includes an innovative application of focal loss, specifically adapted to\naddress the extreme class imbalance often encountered in BEV segmentation\ntasks, that demonstrates improved segmentation performance on the Dur360BEV\ndataset. The results show that our benchmark not only simplifies the sensor\nsetup but also achieves competitive performance.\n","authors":["Wenke E","Chao Yuan","Li Li","Yixin Sun","Yona Falinie A. Gaus","Amir Atapour-Abarghouei","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2503.00675v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04121v1","updated":"2025-03-06T05:58:41Z","published":"2025-03-06T05:58:41Z","title":"Simple Self Organizing Map with Visual Transformer","summary":" Vision Transformers (ViTs) have demonstrated exceptional performance in\nvarious vision tasks. 
However, they tend to underperform on smaller datasets\ndue to their inherent lack of inductive biases. Current approaches address this\nlimitation implicitly-often by pairing ViTs with pretext tasks or by distilling\nknowledge from convolutional neural networks (CNNs) to strengthen the prior. In\ncontrast, Self-Organizing Maps (SOMs), a widely adopted self-supervised\nframework, are inherently structured to preserve topology and spatial\norganization, making them a promising candidate to directly address the\nlimitations of ViTs in limited or small training datasets. Despite this\npotential, equipping SOMs with modern deep learning architectures remains\nlargely unexplored. In this study, we conduct a novel exploration on how Vision\nTransformers (ViTs) and Self-Organizing Maps (SOMs) can empower each other,\naiming to bridge this critical research gap. Our findings demonstrate that\nthese architectures can synergistically enhance each other, leading to\nsignificantly improved performance in both unsupervised and supervised tasks.\nCode will be publicly available.\n","authors":["Alan Luo","Kaiwen Yuan"],"pdf_url":"https://arxiv.org/pdf/2503.04121v1.pdf","comment":"5 pages, 4 figures. Submitted to IEEE. All experiments and code work\n were performed by the first author, with the second author serving in a\n PI/mentor role, guiding the progression of the work"},{"id":"http://arxiv.org/abs/2503.04119v1","updated":"2025-03-06T05:56:25Z","published":"2025-03-06T05:56:25Z","title":"SCSA: A Plug-and-Play Semantic Continuous-Sparse Attention for Arbitrary\n Semantic Style Transfer","summary":" Attention-based arbitrary style transfer methods, including CNN-based,\nTransformer-based, and Diffusion-based, have flourished and produced\nhigh-quality stylized images. 
However, they perform poorly on the content and\nstyle images with the same semantics, i.e., the style of the corresponding\nsemantic region of the generated stylized image is inconsistent with that of\nthe style image. We argue that the root cause lies in their failure to consider\nthe relationship between local regions and semantic regions. To address this\nissue, we propose a plug-and-play semantic continuous-sparse attention, dubbed\nSCSA, for arbitrary semantic style transfer -- each query point considers\ncertain key points in the corresponding semantic region. Specifically, semantic\ncontinuous attention ensures each query point fully attends to all the\ncontinuous key points in the same semantic region that reflect the overall\nstyle characteristics of that region; Semantic sparse attention allows each\nquery point to focus on the most similar sparse key point in the same semantic\nregion that exhibits the specific stylistic texture of that region. By\ncombining the two modules, the resulting SCSA aligns the overall style of the\ncorresponding semantic regions while transferring the vivid textures of these\nregions. Qualitative and quantitative results prove that SCSA enables\nattention-based arbitrary style transfer methods to produce high-quality\nsemantic stylized images.\n","authors":["Chunnan Shang","Zhizhong Wang","Hongwei Wang","Xiangming Meng"],"pdf_url":"https://arxiv.org/pdf/2503.04119v1.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2410.24185v2","updated":"2025-03-06T05:34:17Z","published":"2024-10-31T17:48:45Z","title":"DexMimicGen: Automated Data Generation for Bimanual Dexterous\n Manipulation via Imitation Learning","summary":" Imitation learning from human demonstrations is an effective means to teach\nrobots manipulation skills. But data acquisition is a major bottleneck in\napplying this paradigm more broadly, due to the amount of cost and human effort\ninvolved. 
There has been significant interest in imitation learning for\nbimanual dexterous robots, like humanoids. Unfortunately, data collection is\neven more challenging here due to the challenges of simultaneously controlling\nmultiple arms and multi-fingered hands. Automated data generation in simulation\nis a compelling, scalable alternative to fuel this need for data. To this end,\nwe introduce DexMimicGen, a large-scale automated data generation system that\nsynthesizes trajectories from a handful of human demonstrations for humanoid\nrobots with dexterous hands. We present a collection of simulation environments\nin the setting of bimanual dexterous manipulation, spanning a range of\nmanipulation behaviors and different requirements for coordination among the\ntwo arms. We generate 21K demos across these tasks from just 60 source human\ndemos and study the effect of several data generation and policy learning\ndecisions on agent performance. Finally, we present a real-to-sim-to-real\npipeline and deploy it on a real-world humanoid can sorting task. Generated\ndatasets, simulation environments and additional results are at\nhttps://dexmimicgen.github.io/\n","authors":["Zhenyu Jiang","Yuqi Xie","Kevin Lin","Zhenjia Xu","Weikang Wan","Ajay Mandlekar","Linxi Fan","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.24185v2.pdf","comment":"ICRA 2025. Project website: https://dexmimicgen.github.io/"},{"id":"http://arxiv.org/abs/2503.04107v1","updated":"2025-03-06T05:29:20Z","published":"2025-03-06T05:29:20Z","title":"Fractional Correspondence Framework in Detection Transformer","summary":" The Detection Transformer (DETR), by incorporating the Hungarian algorithm,\nhas significantly simplified the matching process in object detection tasks.\nThis algorithm facilitates optimal one-to-one matching of predicted bounding\nboxes to ground-truth annotations during training. 
While effective, this strict\nmatching process does not inherently account for the varying densities and\ndistributions of objects, leading to suboptimal correspondences such as failing\nto handle multiple detections of the same object or missing small objects. To\naddress this, we propose the Regularized Transport Plan (RTP). RTP introduces a\nflexible matching strategy that captures the cost of aligning predictions with\nground truths to find the most accurate correspondences between these sets. By\nutilizing the differentiable Sinkhorn algorithm, RTP allows for soft,\nfractional matching rather than strict one-to-one assignments. This approach\nenhances the model's capability to manage varying object densities and\ndistributions effectively. Our extensive evaluations on the MS-COCO and VOC\nbenchmarks demonstrate the effectiveness of our approach. RTP-DETR, surpassing\nthe performance of the Deform-DETR and the recently introduced DINO-DETR,\nachieving absolute gains in mAP of +3.8% and +1.7%, respectively.\n","authors":["Masoumeh Zareapoor","Pourya Shamsolmoali","Huiyu Zhou","Yue Lu","Salvador García"],"pdf_url":"https://arxiv.org/pdf/2503.04107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04106v1","updated":"2025-03-06T05:28:44Z","published":"2025-03-06T05:28:44Z","title":"WeakMedSAM: Weakly-Supervised Medical Image Segmentation via SAM with\n Sub-Class Exploration and Prompt Affinity Mining","summary":" We have witnessed remarkable progress in foundation models in vision tasks.\nCurrently, several recent works have utilized the segmenting anything model\n(SAM) to boost the segmentation performance in medical images, where most of\nthem focus on training an adaptor for fine-tuning a large amount of pixel-wise\nannotated medical images following a fully supervised manner. In this paper, to\nreduce the labeling cost, we investigate a novel weakly-supervised SAM-based\nsegmentation model, namely WeakMedSAM. 
Specifically, our proposed WeakMedSAM\ncontains two modules: 1) to mitigate severe co-occurrence in medical images, a\nsub-class exploration module is introduced to learn accurate feature\nrepresentations. 2) to improve the quality of the class activation maps, our\nprompt affinity mining module utilizes the prompt capability of SAM to obtain\nan affinity map for random-walk refinement. Our method can be applied to any\nSAM-like backbone, and we conduct experiments with SAMUS and EfficientSAM. The\nexperimental results on three popularly-used benchmark datasets, i.e., BraTS\n2019, AbdomenCT-1K, and MSD Cardiac dataset, show the promising results of our\nproposed WeakMedSAM. Our code is available at\nhttps://github.com/wanghr64/WeakMedSAM.\n","authors":["Haoran Wang","Lian Huai","Wenbin Li","Lei Qi","Xingqun Jiang","Yinghuan Shi"],"pdf_url":"https://arxiv.org/pdf/2503.04106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02910v2","updated":"2025-03-06T05:19:44Z","published":"2025-03-04T06:17:17Z","title":"LangGas: Introducing Language in Selective Zero-Shot Background\n Subtraction for Semi-Transparent Gas Leak Detection with a New Dataset","summary":" Gas leakage poses a significant hazard that requires prevention.\nTraditionally, human inspection has been used for detection, a slow and\nlabour-intensive process. Recent research has applied machine learning\ntechniques to this problem, yet there remains a shortage of high-quality,\npublicly available datasets. This paper introduces a synthetic dataset\nfeaturing diverse backgrounds, interfering foreground objects, diverse leak\nlocations, and precise segmentation ground truth. We propose a zero-shot method\nthat combines background subtraction, zero-shot object detection, filtering,\nand segmentation to leverage this dataset. 
Experimental results indicate that\nour approach significantly outperforms baseline methods based solely on\nbackground subtraction and zero-shot object detection with segmentation,\nreaching an IoU of 69\\% overall. We also present an analysis of various prompt\nconfigurations and threshold settings to provide deeper insights into the\nperformance of our method. The code and dataset will be released after\npublication.\n","authors":["Wenqi Guo","Yiyang Du","Shan Du"],"pdf_url":"https://arxiv.org/pdf/2503.02910v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.14909v2","updated":"2025-03-06T05:18:12Z","published":"2025-02-19T02:56:27Z","title":"Comparing Deep Neural Network for Multi-Label ECG Diagnosis From Scanned\n ECG","summary":" Automated ECG diagnosis has seen significant advancements with deep learning\ntechniques, but real-world applications still face challenges when dealing with\nscanned paper ECGs. In this study, we explore multi-label classification of\nECGs extracted from scanned images, moving beyond traditional binary\nclassification (normal/abnormal). We evaluate the performance of multiple deep\nneural network architectures, including AlexNet, VGG, ResNet, and Vision\nTransformer, on scanned ECG datasets. Our comparative analysis examines model\naccuracy, robustness to image artifacts, and generalizability across different\nECG conditions. Additionally, we investigate whether ECG signals extracted from\nscanned images retain sufficient diagnostic information for reliable automated\nclassification. The findings highlight the strengths and limitations of each\narchitecture, providing insights into the feasibility of image-based ECG\ndiagnosis and its potential integration into clinical workflows.\n","authors":["Cuong V. Nguyen","Hieu X. Nguyen","Dung D. Pham Minh","Cuong D. 
Do"],"pdf_url":"https://arxiv.org/pdf/2502.14909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04096v1","updated":"2025-03-06T05:13:19Z","published":"2025-03-06T05:13:19Z","title":"Image-Based Relocalization and Alignment for Long-Term Monitoring of\n Dynamic Underwater Environments","summary":" Effective monitoring of underwater ecosystems is crucial for tracking\nenvironmental changes, guiding conservation efforts, and ensuring long-term\necosystem health. However, automating underwater ecosystem management with\nrobotic platforms remains challenging due to the complexities of underwater\nimagery, which pose significant difficulties for traditional visual\nlocalization methods. We propose an integrated pipeline that combines Visual\nPlace Recognition (VPR), feature matching, and image segmentation on\nvideo-derived images. This method enables robust identification of revisited\nareas, estimation of rigid transformations, and downstream analysis of\necosystem changes. Furthermore, we introduce the SQUIDLE+ VPR Benchmark-the\nfirst large-scale underwater VPR benchmark designed to leverage an extensive\ncollection of unstructured data from multiple robotic platforms, spanning time\nintervals from days to years. The dataset encompasses diverse trajectories,\narbitrary overlap and diverse seafloor types captured under varying\nenvironmental conditions, including differences in depth, lighting, and\nturbidity. Our code is available at: https://github.com/bev-gorry/underloc\n","authors":["Beverley Gorry","Tobias Fischer","Michael Milford","Alejandro Fontan"],"pdf_url":"https://arxiv.org/pdf/2503.04096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16805v3","updated":"2025-03-06T05:06:49Z","published":"2024-11-25T14:38:43Z","title":"Human Motion Instruction Tuning","summary":" This paper presents LLaMo (Large Language and Human Motion Assistant), a\nmultimodal framework for human motion instruction tuning. 
In contrast to\nconventional instruction-tuning approaches that convert non-linguistic inputs,\nsuch as video or motion sequences, into language tokens, LLaMo retains motion\nin its native form for instruction tuning. This method preserves\nmotion-specific details that are often diminished in tokenization, thereby\nimproving the model's ability to interpret complex human behaviors. By\nprocessing both video and motion data alongside textual inputs, LLaMo enables a\nflexible, human-centric analysis. Experimental evaluations across\nhigh-complexity domains, including human behaviors and professional activities,\nindicate that LLaMo effectively captures domain-specific knowledge, enhancing\ncomprehension and prediction in motion-intensive scenarios. We hope LLaMo\noffers a foundation for future multimodal AI systems with broad applications,\nfrom sports analytics to behavioral prediction. Our code and models are\navailable on the project website: https://github.com/ILGLJ/LLaMo.\n","authors":["Lei Li","Sen Jia","Wang Jianhao","Zhongyu Jiang","Feng Zhou","Ju Dai","Tianfang Zhang","Wu Zongkai","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2411.16805v3.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04087v1","updated":"2025-03-06T04:50:07Z","published":"2025-03-06T04:50:07Z","title":"Brain Tumor Detection in MRI Based on Federated Learning with YOLOv11","summary":" One of the primary challenges in medical diagnostics is the accurate and\nefficient use of magnetic resonance imaging (MRI) for the detection of brain\ntumors. But the current machine learning (ML) approaches have two major\nlimitations, data privacy and high latency. To solve the problem, in this work\nwe propose a federated learning architecture for a better accurate brain tumor\ndetection incorporating the YOLOv11 algorithm. 
In contrast to earlier methods\nof centralized learning, our federated learning approach protects the\nunderlying medical data while supporting cooperative deep learning model\ntraining across multiple institutions. To allow the YOLOv11 model to locate and\nidentify tumor areas, we adjust it to handle MRI data. To ensure robustness and\ngeneralizability, the model is trained and tested on a wide range of MRI data\ncollected from several anonymous medical facilities. The results indicate that\nour method significantly maintains higher accuracy than conventional\napproaches.\n","authors":["Sheikh Moonwara Anjum Monisha","Ratun Rahman"],"pdf_url":"https://arxiv.org/pdf/2503.04087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14701v4","updated":"2025-03-06T04:38:23Z","published":"2024-05-23T15:35:48Z","title":"DreamText: High Fidelity Scene Text Synthesis","summary":" Scene text synthesis involves rendering specified texts onto arbitrary\nimages. Current methods typically formulate this task in an end-to-end manner\nbut lack effective character-level guidance during training. Besides, their\ntext encoders, pre-trained on a single font type, struggle to adapt to the\ndiverse font styles encountered in practical applications. Consequently, these\nmethods suffer from character distortion, repetition, and absence, particularly\nin polystylistic scenarios. To this end, this paper proposes DreamText for\nhigh-fidelity scene text synthesis. Our key idea is to reconstruct the\ndiffusion training process, introducing more refined guidance tailored to this\ntask, to expose and rectify the model's attention at the character level and\nstrengthen its learning of text regions. This transformation poses a hybrid\noptimization challenge, involving both discrete and continuous variables. To\neffectively tackle this challenge, we employ a heuristic alternate optimization\nstrategy. 
Meanwhile, we jointly train the text encoder and generator to\ncomprehensively learn and utilize the diverse font present in the training\ndataset. This joint training is seamlessly integrated into the alternate\noptimization process, fostering a synergistic relationship between learning\ncharacter embedding and re-estimating character attention. Specifically, in\neach step, we first encode potential character-generated position information\nfrom cross-attention maps into latent character masks. These masks are then\nutilized to update the representation of specific characters in the current\nstep, which, in turn, enables the generator to correct the character's\nattention in the subsequent steps. Both qualitative and quantitative results\ndemonstrate the superiority of our method to the state of the art.\n","authors":["Yibin Wang","Weizhong Zhang","Honghui Xu","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2405.14701v4.pdf","comment":"Code: https://github.com/CodeGoat24/DreamText, Project page:\n https://codegoat24.github.io/DreamText/"},{"id":"http://arxiv.org/abs/2503.04082v1","updated":"2025-03-06T04:37:09Z","published":"2025-03-06T04:37:09Z","title":"Instrument-Splatting: Controllable Photorealistic Reconstruction of\n Surgical Instruments Using Gaussian Splatting","summary":" Real2Sim is becoming increasingly important with the rapid development of\nsurgical artificial intelligence (AI) and autonomy. In this work, we propose a\nnovel Real2Sim methodology, \\textit{Instrument-Splatting}, that leverages 3D\nGaussian Splatting to provide fully controllable 3D reconstruction of surgical\ninstruments from monocular surgical videos. 
To maintain both high visual\nfidelity and manipulability, we introduce a geometry pre-training to bind\nGaussian point clouds on part mesh with accurate geometric priors and define a\nforward kinematics to control the Gaussians as flexible as real instruments.\nAfterward, to handle unposed videos, we design a novel instrument pose tracking\nmethod leveraging semantics-embedded Gaussians to robustly refine per-frame\ninstrument poses and joint states in a render-and-compare manner, which allows\nour instrument Gaussian to accurately learn textures and reach photorealistic\nrendering. We validated our method on 2 publicly released surgical videos and 4\nvideos collected on ex vivo tissues and green screens. Quantitative and\nqualitative evaluations demonstrate the effectiveness and superiority of the\nproposed method.\n","authors":["Shuojue Yang","Zijian Wu","Mingxuan Hong","Qian Li","Daiyun Shen","Septimiu E. Salcudean","Yueming Jin"],"pdf_url":"https://arxiv.org/pdf/2503.04082v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2503.04079v1","updated":"2025-03-06T04:33:19Z","published":"2025-03-06T04:33:19Z","title":"Surgical Gaussian Surfels: Highly Accurate Real-time Surgical Scene\n Rendering","summary":" Accurate geometric reconstruction of deformable tissues in monocular\nendoscopic video remains a fundamental challenge in robot-assisted minimally\ninvasive surgery. Although recent volumetric and point primitive methods based\non neural radiance fields (NeRF) and 3D Gaussian primitives have efficiently\nrendered surgical scenes, they still struggle with handling artifact-free tool\nocclusions and preserving fine anatomical details. These limitations stem from\nunrestricted Gaussian scaling and insufficient surface alignment constraints\nduring reconstruction. 
To address these issues, we introduce Surgical Gaussian\nSurfels (SGS), which transforms anisotropic point primitives into\nsurface-aligned elliptical splats by constraining the scale component of the\nGaussian covariance matrix along the view-aligned axis. We predict accurate\nsurfel motion fields using a lightweight Multi-Layer Perceptron (MLP) coupled\nwith locality constraints to handle complex tissue deformations. We use\nhomodirectional view-space positional gradients to capture fine image details\nby splitting Gaussian Surfels in over-reconstructed regions. In addition, we\ndefine surface normals as the direction of the steepest density change within\neach Gaussian surfel primitive, enabling accurate normal estimation without\nrequiring monocular normal priors. We evaluate our method on two in-vivo\nsurgical datasets, where it outperforms current state-of-the-art methods in\nsurface geometry, normal map quality, and rendering efficiency, while remaining\ncompetitive in real-time rendering performance. We make our code available at\nhttps://github.com/aloma85/SurgicalGaussianSurfels\n","authors":["Idris O. Sunmola","Zhenjun Zhao","Samuel Schmidgall","Yumeng Wang","Paul Maria Scheikl","Axel Krieger"],"pdf_url":"https://arxiv.org/pdf/2503.04079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20637v2","updated":"2025-03-06T04:31:21Z","published":"2025-02-28T01:36:38Z","title":"TractCloud-FOV: Deep Learning-based Robust Tractography Parcellation in\n Diffusion MRI with Incomplete Field of View","summary":" Tractography parcellation classifies streamlines reconstructed from diffusion\nMRI into anatomically defined fiber tracts for clinical and research\napplications. However, clinical scans often have incomplete fields of view\n(FOV) where brain regions are partially imaged, leading to partial or truncated\nfiber tracts. 
To address this challenge, we introduce TractCloud-FOV, a deep\nlearning framework that robustly parcellates tractography under conditions of\nincomplete FOV. We propose a novel training strategy, FOV-Cut Augmentation\n(FOV-CA), in which we synthetically cut tractograms to simulate a spectrum of\nreal-world inferior FOV cutoff scenarios. This data augmentation approach\nenriches the training set with realistic truncated streamlines, enabling the\nmodel to achieve superior generalization. We evaluate the proposed\nTractCloud-FOV on both synthetically cut tractography and two real-life\ndatasets with incomplete FOV. TractCloud-FOV significantly outperforms several\nstate-of-the-art methods on all testing datasets in terms of streamline\nclassification accuracy, generalization ability, tract anatomical depiction,\nand computational efficiency. Overall, TractCloud-FOV achieves efficient and\nconsistent tractography parcellation in diffusion MRI with incomplete FOV.\n","authors":["Yuqian Chen","Leo Zekelman","Yui Lo","Suheyla Cetin-Karayumak","Tengfei Xue","Yogesh Rathi","Nikos Makris","Fan Zhang","Weidong Cai","Lauren J. O'Donnell"],"pdf_url":"https://arxiv.org/pdf/2502.20637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04078v1","updated":"2025-03-06T04:28:11Z","published":"2025-03-06T04:28:11Z","title":"Spatial-Temporal Perception with Causal Inference for Naturalistic\n Driving Action Recognition","summary":" Naturalistic driving action recognition is essential for vehicle cabin\nmonitoring systems. 
However, the complexity of real-world backgrounds presents\nsignificant challenges for this task, and previous approaches have struggled\nwith practical implementation due to their limited ability to observe subtle\nbehavioral differences and effectively learn inter-frame features from video.\nIn this paper, we propose a novel Spatial-Temporal Perception (STP)\narchitecture that emphasizes both temporal information and spatial\nrelationships between key objects, incorporating a causal decoder to perform\nbehavior recognition and temporal action localization. Without requiring\nmultimodal input, STP directly extracts temporal and spatial distance features\nfrom RGB video clips. Subsequently, these dual features are jointly encoded by\nmaximizing the expected likelihood across all possible permutations of the\nfactorization order. By integrating temporal and spatial features at different\nscales, STP can perceive subtle behavioral changes in challenging scenarios.\nAdditionally, we introduce a causal-aware module to explore relationships\nbetween video frame features, significantly enhancing detection efficiency and\nperformance. We validate the effectiveness of our approach using two publicly\navailable driver distraction detection benchmarks. The results demonstrate that\nour framework achieves state-of-the-art performance.\n","authors":["Qing Chang","Wei Dai","Zhihao Shuai","Limin Yu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2503.04078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.17741v5","updated":"2025-03-06T04:11:30Z","published":"2024-12-23T17:44:05Z","title":"Reasoning to Attend: Try to Understand How Token Works","summary":" Current Large Multimodal Models (LMMs) empowered visual grounding typically\nrely on $\\texttt{}$ tokens as a text prompt to jointly optimize the\nvision-language model (e.g., LLaVA) and the downstream task-specific model\n(e.g., SAM). 
However, we observe that little research has looked into how it\nworks.In this work, we first visualize the similarity maps, which are obtained\nby computing the semantic similarity between the $\\texttt{}$ token and the\nimage token embeddings derived from the last hidden layer in both the LLaVA\nencoder and SAM decoder. Intriguingly, we have found that a striking\nconsistency holds in terms of activation responses in the similarity map, which\nreveals that what the $\\texttt{}$ token contributes to is semantic\nsimilarity within image-text pairs. Specifically, the $\\texttt{}$ token, a\nplaceholder expanded in text vocabulary, extensively queries among individual\ntokenized image patches to match the semantics of an object from text to the\npaired image, while the Large Language Models (LLMs) are being fine-tuned. Upon\nthe above findings, we present READ, which facilitates LMMs' resilient\n$\\textbf{REA}$soning capability of where to atten$\\textbf{D}$ under the\nguidance of highly activated points borrowed from similarity maps. Remarkably,\nREAD features an intuitive design, Similarity as Points module (SasP), which\ncan be seamlessly applied to $\\texttt{}$-like paradigms in a plug-and-play\nfashion. Also, extensive experiments have been conducted on ReasonSeg and\nRefCOCO(+/g) datasets. To validate whether READ suffers from catastrophic\nforgetting of previous skills after fine-tuning, we further assess its\ngeneration ability on an augmented FP-RefCOCO(+/g) dataset. 
All codes and\nmodels are publicly available at https://github.com/rui-qian/READ.\n","authors":["Rui Qian","Xin Yin","Dejing Dou"],"pdf_url":"https://arxiv.org/pdf/2412.17741v5.pdf","comment":"This work has been accepted to CVPR 2025, please refer to\n https://github.com/rui-qian/READ"},{"id":"http://arxiv.org/abs/2405.15683v3","updated":"2025-03-06T03:59:59Z","published":"2024-05-24T16:21:59Z","title":"Visual Description Grounding Reduces Hallucinations and Boosts Reasoning\n in LVLMs","summary":" Large Vision-Language Models (LVLMs) often produce responses that misalign\nwith factual information, a phenomenon known as hallucinations. While\nhallucinations are well-studied, the exact causes behind them remain\nunderexplored. In this paper, we first investigate the root causes of\nhallucinations in LVLMs. Our findings reveal that existing mitigation\ntechniques primarily reduce hallucinations for visual recognition prompts-those\nthat require simple descriptions of visual elements-but fail for cognitive\nprompts that demand deliberate reasoning. We identify the core issue as a lack\nof true visual perception in LVLMs: although they can accurately recognize\nvisual elements, they struggle to fully interpret these elements in the context\nof the input prompt and effectively link this recognition to their internal\nknowledge, which is critical for reasoning. To address this gap, we introduce\nVisual Description Grounded Decoding (VDGD), a simple, robust, and\ntraining-free method designed to enhance visual perception and improve\nreasoning capabilities in LVLMs. VDGD works by first generating a detailed\ndescription of the image and appending it as a prefix to the instruction.\nDuring response generation, tokens are sampled based on their KL divergence to\nthe description, favoring candidates with lower divergence. Experimental\nresults on multiple visual reasoning benchmarks and LVLMs demonstrate that VDGD\nconsistently outperforms existing baselines 2% - 33%. 
Finally, we introduce\nVaLLu, a benchmark designed for comprehensive evaluation of the cognitive\ncapabilities of LVLMs.\n","authors":["Sreyan Ghosh","Chandra Kiran Reddy Evuru","Sonal Kumar","Utkarsh Tyagi","Oriol Nieto","Zeyu Jin","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2405.15683v3.pdf","comment":"Accepted to ICLR 2025. Project: https://sreyan88.github.io/VDGD/"},{"id":"http://arxiv.org/abs/2502.16445v3","updated":"2025-03-06T03:55:58Z","published":"2025-02-23T05:08:06Z","title":"Iterative Flow Matching -- Path Correction and Gradual Refinement for\n Enhanced Generative Modeling","summary":" Generative models for image generation are now commonly used for a wide\nvariety of applications, ranging from guided image generation for entertainment\nto solving inverse problems. Nonetheless, training a generator is a non-trivial\nfeat that requires fine-tuning and can lead to so-called hallucinations, that\nis, the generation of images that are unrealistic. In this work, we explore\nimage generation using flow matching. We explain and demonstrate why flow\nmatching can generate hallucinations, and propose an iterative process to\nimprove the generation process. Our iterative process can be integrated into\nvirtually $\\textit{any}$ generative modeling technique, thereby enhancing the\nperformance and robustness of image synthesis systems.\n","authors":["Eldad Haber","Shadab Ahamed","Md. Shahriar Rahim Siddiqui","Niloufar Zakariaei","Moshe Eliasof"],"pdf_url":"https://arxiv.org/pdf/2502.16445v3.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2503.04067v1","updated":"2025-03-06T03:52:46Z","published":"2025-03-06T03:52:46Z","title":"FREAK: Frequency-modulated High-fidelity and Real-time Audio-driven\n Talking Portrait Synthesis","summary":" Achieving high-fidelity lip-speech synchronization in audio-driven talking\nportrait synthesis remains challenging. 
While multi-stage pipelines or\ndiffusion models yield high-quality results, they suffer from high\ncomputational costs. Some approaches perform well on specific individuals with\nlow resources, yet still exhibit mismatched lip movements. The aforementioned\nmethods are modeled in the pixel domain. We observed that there are noticeable\ndiscrepancies in the frequency domain between the synthesized talking videos\nand natural videos. Currently, no research on talking portrait synthesis has\nconsidered this aspect. To address this, we propose a FREquency-modulated,\nhigh-fidelity, and real-time Audio-driven talKing portrait synthesis framework,\nnamed FREAK, which models talking portraits from the frequency domain\nperspective, enhancing the fidelity and naturalness of the synthesized\nportraits. FREAK introduces two novel frequency-based modules: 1) the Visual\nEncoding Frequency Modulator (VEFM) to couple multi-scale visual features in\nthe frequency domain, better preserving visual frequency information and\nreducing the gap in the frequency spectrum between synthesized and natural\nframes. and 2) the Audio Visual Frequency Modulator (AVFM) to help the model\nlearn the talking pattern in the frequency domain and improve audio-visual\nsynchronization. Additionally, we optimize the model in both pixel domain and\nfrequency domain jointly. Furthermore, FREAK supports seamless switching\nbetween one-shot and video dubbing settings, offering enhanced flexibility. Due\nto its superior performance, it can simultaneously support high-resolution\nvideo results and real-time inference. 
Extensive experiments demonstrate that\nour method synthesizes high-fidelity talking portraits with detailed facial\ntextures and precise lip synchronization in real-time, outperforming\nstate-of-the-art methods.\n","authors":["Ziqi Ni","Ao Fu","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2503.04067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.00741v2","updated":"2025-03-06T03:44:10Z","published":"2025-03-02T05:36:04Z","title":"LesionDiffusion: Towards Text-controlled General Lesion Synthesis","summary":" Fully-supervised lesion recognition methods in medical imaging face\nchallenges due to the reliance on large annotated datasets, which are expensive\nand difficult to collect. To address this, synthetic lesion generation has\nbecome a promising approach. However, existing models struggle with\nscalability, fine-grained control over lesion attributes, and the generation of\ncomplex structures. We propose LesionDiffusion, a text-controllable lesion\nsynthesis framework for 3D CT imaging that generates both lesions and\ncorresponding masks. By utilizing a structured lesion report template, our\nmodel provides greater control over lesion attributes and supports a wider\nvariety of lesion types. We introduce a dataset of 1,505 annotated CT scans\nwith paired lesion masks and structured reports, covering 14 lesion types\nacross 8 organs. LesionDiffusion consists of two components: a lesion mask\nsynthesis network (LMNet) and a lesion inpainting network (LINet), both guided\nby lesion attributes and image features. Extensive experiments demonstrate that\nLesionDiffusion significantly improves segmentation performance, with strong\ngeneralization to unseen lesion types and organs, outperforming current\nstate-of-the-art models. 
Code will be available at\nhttps://github.com/HengruiTianSJTU/LesionDiffusion.\n","authors":["Henrui Tian","Wenhui Lei","Linrui Dai","Hanyu Chen","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.00741v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2503.04065v1","updated":"2025-03-06T03:43:21Z","published":"2025-03-06T03:43:21Z","title":"PP-DocBee: Improving Multimodal Document Understanding Through a Bag of\n Tricks","summary":" With the rapid advancement of digitalization, various document images are\nbeing applied more extensively in production and daily life, and there is an\nincreasingly urgent need for fast and accurate parsing of the content in\ndocument images. Therefore, this report presents PP-DocBee, a novel multimodal\nlarge language model designed for end-to-end document image understanding.\nFirst, we develop a data synthesis strategy tailored to document scenarios in\nwhich we build a diverse dataset to improve the model generalization. Then, we\napply a few training techniques, including dynamic proportional sampling, data\npreprocessing, and OCR postprocessing strategies. 
Extensive evaluations\ndemonstrate the superior performance of PP-DocBee, achieving state-of-the-art\nresults on English document understanding benchmarks and even outperforming\nexisting open source and commercial models in Chinese document understanding.\nThe source code and pre-trained models are publicly available at\n\\href{https://github.com/PaddlePaddle/PaddleMIX}{https://github.com/PaddlePaddle/PaddleMIX}.\n","authors":["Feng Ni","Kui Huang","Yao Lu","Wenyu Lv","Guanzhong Wang","Zeyu Chen","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2503.04065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.00736v2","updated":"2025-03-06T03:35:09Z","published":"2025-03-02T05:20:41Z","title":"Shazam: Unifying Multiple Foundation Models for Advanced Computational\n Pathology","summary":" Foundation Models (FMs) in computational pathology (CPath) have significantly\nadvanced the extraction of meaningful features from histopathology image\ndatasets, achieving strong performance across various clinical tasks. Despite\ntheir impressive performance, these models often exhibit variability when\napplied to different tasks, prompting the need for a unified framework capable\nof consistently excelling across various applications. In this work, we propose\nShazam, a novel framework designed to efficiently combine multiple CPath\nmodels. Unlike previous approaches that train a fixed-parameter FM, Shazam\ndynamically extracts and refines information from diverse FMs for each specific\ntask. To ensure that each FM contributes effectively without dominance, a novel\ndistillation strategy is applied, guiding the student model with features from\nall teacher models, which enhances its generalization ability. Experimental\nresults on two pathology patch classification datasets demonstrate that Shazam\noutperforms existing CPath models and other fusion methods. Its lightweight,\nflexible design makes it a promising solution for improving CPath analysis in\nreal-world settings. 
Code will be available at\nhttps://github.com/Tuner12/Shazam.\n","authors":["Wenhui Lei","Anqi Li","Yusheng Tan","Hanyu Chen","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.00736v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2503.03190v2","updated":"2025-03-06T03:32:56Z","published":"2025-03-05T05:13:53Z","title":"DSPNet: Dual-vision Scene Perception for Robust 3D Question Answering","summary":" 3D Question Answering (3D QA) requires the model to comprehensively\nunderstand its situated 3D scene described by the text, then reason about its\nsurrounding environment and answer a question under that situation. However,\nexisting methods usually rely on global scene perception from pure 3D point\nclouds and overlook the importance of rich local texture details from\nmulti-view images. Moreover, due to the inherent noise in camera poses and\ncomplex occlusions, there exists significant feature degradation and reduced\nfeature robustness problems when aligning 3D point cloud with multi-view\nimages. In this paper, we propose a Dual-vision Scene Perception Network\n(DSPNet), to comprehensively integrate multi-view and point cloud features to\nimprove robustness in 3D QA. Our Text-guided Multi-view Fusion (TGMF) module\nprioritizes image views that closely match the semantic content of the text. To\nadaptively fuse back-projected multi-view images with point cloud features, we\ndesign the Adaptive Dual-vision Perception (ADVP) module, enhancing 3D scene\ncomprehension. Additionally, our Multimodal Context-guided Reasoning (MCGR)\nmodule facilitates robust reasoning by integrating contextual information\nacross visual and linguistic modalities. Experimental results on SQA3D and\nScanQA datasets demonstrate the superiority of our DSPNet. 
Codes will be\navailable at https://github.com/LZ-CH/DSPNet.\n","authors":["Jingzhou Luo","Yang Liu","Weixing Chen","Zhen Li","Yaowei Wang","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2503.03190v2.pdf","comment":"Accepted by CVPR 2025"},{"id":"http://arxiv.org/abs/2410.21259v4","updated":"2025-03-06T03:31:32Z","published":"2024-10-28T17:55:08Z","title":"AutoBench-V: Can Large Vision-Language Models Benchmark Themselves?","summary":" Large Vision-Language Models (LVLMs) have become essential for advancing the\nintegration of visual and linguistic information. However, the evaluation of\nLVLMs presents significant challenges as the evaluation benchmark always\ndemands lots of human cost for its construction, and remains static, lacking\nflexibility once constructed. Even though automatic evaluation has been\nexplored in textual modality, the visual modality remains under-explored. As a\nresult, in this work, we address a question: \"Can LVLMs themselves be used to\nbenchmark each other in the visual automatically domain?\". We introduce\nAutoBench-V, an automated framework for serving evaluation on demand, i.e.,\nbenchmarking LVLMs based on specific aspects of model capability. AutoBench-V\nleverages text-to-image models to generate relevant image samples and then\nutilizes LVLMs to orchestrate visual question-answering (VQA) tasks, completing\nthe evaluation process efficiently and flexibly. 
Through an extensive\nevaluation of nine popular LVLMs across five demanded user inputs (i.e.,\nevaluation capabilities), the framework shows effectiveness and reliability.\n","authors":["Han Bao","Yue Huang","Yanbo Wang","Jiayi Ye","Xiangqi Wang","Xiuying Chen","Yue Zhao","Tianyi Zhou","Mohamed Elhoseiny","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.21259v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04059v1","updated":"2025-03-06T03:27:14Z","published":"2025-03-06T03:27:14Z","title":"H3O: Hyper-Efficient 3D Occupancy Prediction with Heterogeneous\n Supervision","summary":" 3D occupancy prediction has recently emerged as a new paradigm for holistic\n3D scene understanding and provides valuable information for downstream\nplanning in autonomous driving. Most existing methods, however, are\ncomputationally expensive, requiring costly attention-based 2D-3D\ntransformation and 3D feature processing. In this paper, we present a novel 3D\noccupancy prediction approach, H3O, which features highly efficient\narchitecture designs that incur a significantly lower computational cost as\ncompared to the current state-of-the-art methods. In addition, to compensate\nfor the ambiguity in ground-truth 3D occupancy labels, we advocate leveraging\nauxiliary tasks to complement the direct 3D supervision. In particular, we\nintegrate multi-camera depth estimation, semantic segmentation, and surface\nnormal estimation via differentiable volume rendering, supervised by\ncorresponding 2D labels that introduces rich and heterogeneous supervision\nsignals. 
We conduct extensive experiments on the Occ3D-nuScenes and\nSemanticKITTI benchmarks that demonstrate the superiority of our proposed H3O.\n","authors":["Yunxiao Shi","Hong Cai","Amin Ansari","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2503.04059v1.pdf","comment":"ICRA 2025"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2503.04725v1","updated":"2025-03-06T18:59:48Z","published":"2025-03-06T18:59:48Z","title":"L$^2$M: Mutual Information Scaling Law for Long-Context Language\n Modeling","summary":" We rigorously establish a bipartite mutual information scaling law in natural\nlanguage that governs long-range dependencies. This scaling law, which we show\nis distinct from and scales independently of the conventional two-point mutual\ninformation, is the key to understanding long-context language modeling. Using\nthis scaling law, we formulate the Long-context Language Modeling (L$^2$M)\ncondition, which relates a model's capacity for effective long context length\nmodeling to the scaling of its latent state size for storing past information.\nOur results are validated through experiments on both transformers and state\nspace models. This work establishes a theoretical foundation that guides the\ndevelopment of large language models toward longer context lengths.\n","authors":["Zhuo Chen","Oriol Mayné i Comas","Zhuotao Jin","Di Luo","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2503.04725v1.pdf","comment":"29 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2503.04723v1","updated":"2025-03-06T18:59:37Z","published":"2025-03-06T18:59:37Z","title":"Shifting Long-Context LLMs Research from Input to Output","summary":" Recent advancements in long-context Large Language Models (LLMs) have\nprimarily concentrated on processing extended input contexts, resulting in\nsignificant strides in long-context comprehension. However, the equally\ncritical aspect of generating long-form outputs has received comparatively less\nattention. 
This paper advocates for a paradigm shift in NLP research toward\naddressing the challenges of long-output generation. Tasks such as novel\nwriting, long-term planning, and complex reasoning require models to understand\nextensive contexts and produce coherent, contextually rich, and logically\nconsistent extended text. These demands highlight a critical gap in current LLM\ncapabilities. We underscore the importance of this under-explored domain and\ncall for focused efforts to develop foundational LLMs tailored for generating\nhigh-quality, long-form outputs, which hold immense potential for real-world\napplications.\n","authors":["Yuhao Wu","Yushi Bai","Zhiqing Hu","Shangqing Tu","Ming Shan Hee","Juanzi Li","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2503.04723v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2503.04722v1","updated":"2025-03-06T18:59:23Z","published":"2025-03-06T18:59:23Z","title":"Enough Coin Flips Can Make LLMs Act Bayesian","summary":" Large language models (LLMs) exhibit the ability to generalize given few-shot\nexamples in their input prompt, an emergent capability known as in-context\nlearning (ICL). We investigate whether LLMs utilize ICL to perform structured\nreasoning in ways that are consistent with a Bayesian framework or rely on\npattern matching. Using a controlled setting of biased coin flips, we find\nthat: (1) LLMs often possess biased priors, causing initial divergence in\nzero-shot settings, (2) in-context evidence outweighs explicit bias\ninstructions, (3) LLMs broadly follow Bayesian posterior updates, with\ndeviations primarily due to miscalibrated priors rather than flawed updates,\nand (4) attention magnitude has negligible effect on Bayesian inference. With\nsufficient demonstrations of biased coin flips via ICL, LLMs update their\npriors in a Bayesian manner.\n","authors":["Ritwik Gupta","Rodolfo Corona","Jiaxin Ge","Eric Wang","Dan Klein","Trevor Darrell","David M. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2503.04722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04715v1","updated":"2025-03-06T18:58:29Z","published":"2025-03-06T18:58:29Z","title":"Predictable Scale: Part I -- Optimal Hyperparameter Scaling Law in Large\n Language Model Pretraining","summary":" The impressive capabilities of Large Language Models (LLMs) across diverse\ntasks are now well-established, yet their effective deployment necessitates\ncareful hyperparameter optimization. Through extensive empirical studies\ninvolving grid searches across diverse configurations, we discover universal\nscaling laws governing these hyperparameters: optimal learning rate follows a\npower-law relationship with both model parameters and data sizes, while optimal\nbatch size scales primarily with data sizes. Our analysis reveals a convex\noptimization landscape for hyperparameters under fixed models and data size\nconditions. This convexity implies an optimal hyperparameter plateau. We\ncontribute a universal, plug-and-play optimal hyperparameter tool for the\ncommunity. Its estimated values on the test set are merely 0.07\\% away from the\nglobally optimal LLM performance found via an exhaustive search. These laws\ndemonstrate remarkable robustness across variations in model sparsity, training\ndata distribution, and model shape. To our best known, this is the first work\nthat unifies different model shapes and structures, such as Mixture-of-Experts\nmodels and dense transformers, as well as establishes optimal hyperparameter\nscaling laws across diverse data distributions. This exhaustive optimization\nprocess demands substantial computational resources, utilizing nearly one\nmillion NVIDIA H800 GPU hours to train 3,700 LLMs of varying sizes and\nhyperparameters from scratch and consuming approximately 100 trillion tokens in\ntotal. 
To facilitate reproducibility and further research, we will\nprogressively release all loss measurements and model checkpoints through our\ndesignated repository https://step-law.github.io/\n","authors":["Houyi Li","Wenzheng Zheng","Jingcheng Hu","Qiufeng Wang","Hanshan Zhang","Zili Wang","Yangshijie Xu","Shuigeng Zhou","Xiangyu Zhang","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2503.04715v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2403.11807v7","updated":"2025-03-06T18:58:23Z","published":"2024-03-18T14:04:47Z","title":"How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming\n Ability in Multi-Agent Environments","summary":" Decision-making is a complex process requiring diverse abilities, making it\nan excellent framework for evaluating Large Language Models (LLMs). Researchers\nhave examined LLMs' decision-making through the lens of Game Theory. However,\nexisting evaluation mainly focus on two-player scenarios where an LLM competes\nagainst another. Additionally, previous benchmarks suffer from test set leakage\ndue to their static design. We introduce GAMA($\\gamma$)-Bench, a new framework\nfor evaluating LLMs' Gaming Ability in Multi-Agent environments. It includes\neight classical game theory scenarios and a dynamic scoring scheme specially\ndesigned to quantitatively assess LLMs' performance. $\\gamma$-Bench allows\nflexible game settings and adapts the scoring system to different game\nparameters, enabling comprehensive evaluation of robustness, generalizability,\nand strategies for improvement. Our results indicate that GPT-3.5 demonstrates\nstrong robustness but limited generalizability, which can be enhanced using\nmethods like Chain-of-Thought. We also evaluate 13 LLMs from 6 model families,\nincluding GPT-3.5, GPT-4, Gemini, LLaMA-3.1, Mixtral, and Qwen-2.\nGemini-1.5-Pro outperforms others, scoring of $69.8$ out of $100$, followed by\nLLaMA-3.1-70B ($65.9$) and Mixtral-8x22B ($62.4$). 
Our code and experimental\nresults are publicly available at https://github.com/CUHK-ARISE/GAMABench.\n","authors":["Jen-tse Huang","Eric John Li","Man Ho Lam","Tian Liang","Wenxuan Wang","Youliang Yuan","Wenxiang Jiao","Xing Wang","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.11807v7.pdf","comment":"Accepted to ICLR 2025; 11 pages of main text; 26 pages of appendices;\n Included models: GPT-3.5-{0613, 1106, 0125}, GPT-4-0125, GPT-4o-0806,\n Gemini-{1.0, 1.5)-Pro, LLaMA-3.1-{7, 70, 405}B, Mixtral-8x{7, 22}B,\n Qwen-2-72B"},{"id":"http://arxiv.org/abs/2503.04713v1","updated":"2025-03-06T18:57:40Z","published":"2025-03-06T18:57:40Z","title":"Scaling Rich Style-Prompted Text-to-Speech Datasets","summary":" We introduce Paralinguistic Speech Captions (ParaSpeechCaps), a large-scale\ndataset that annotates speech utterances with rich style captions. While rich\nabstract tags (e.g. guttural, nasal, pained) have been explored in small-scale\nhuman-annotated datasets, existing large-scale datasets only cover basic tags\n(e.g. low-pitched, slow, loud). We combine off-the-shelf text and speech\nembedders, classifiers and an audio language model to automatically scale rich\ntag annotations for the first time. ParaSpeechCaps covers a total of 59 style\ntags, including both speaker-level intrinsic tags and utterance-level\nsituational tags. It consists of 342 hours of human-labelled data (PSC-Base)\nand 2427 hours of automatically annotated data (PSC-Scaled). We finetune\nParler-TTS, an open-source style-prompted TTS model, on ParaSpeechCaps, and\nachieve improved style consistency (+7.9% Consistency MOS) and speech quality\n(+15.5% Naturalness MOS) over the best performing baseline that combines\nexisting rich style tag datasets. We ablate several of our dataset design\nchoices to lay the foundation for future work in this space. 
Our dataset,\nmodels and code are released at https://github.com/ajd12342/paraspeechcaps .\n","authors":["Anuj Diwan","Zhisheng Zheng","David Harwath","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2503.04713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04710v1","updated":"2025-03-06T18:57:16Z","published":"2025-03-06T18:57:16Z","title":"Self-Supervised Models for Phoneme Recognition: Applications in\n Children's Speech for Reading Learning","summary":" Child speech recognition is still an underdeveloped area of research due to\nthe lack of data (especially on non-English languages) and the specific\ndifficulties of this task. Having explored various architectures for child\nspeech recognition in previous work, in this article we tackle recent\nself-supervised models. We first compare wav2vec 2.0, HuBERT and WavLM models\nadapted to phoneme recognition in French child speech, and continue our\nexperiments with the best of them, WavLM base+. We then further adapt it by\nunfreezing its transformer blocks during fine-tuning on child speech, which\ngreatly improves its performance and makes it significantly outperform our base\nmodel, a Transformer+CTC. Finally, we study in detail the behaviour of these\ntwo models under the real conditions of our application, and show that WavLM\nbase+ is more robust to various reading tasks and noise levels. Index Terms:\nspeech recognition, child speech, self-supervised learning\n","authors":["Lucas Block Medin","Thomas Pellegrini","Lucile Gelin"],"pdf_url":"https://arxiv.org/pdf/2503.04710v1.pdf","comment":"This paper was originally published in the Proceedings of Interspeech\n 2024. 
DOI: 10.21437/Interspeech.2024-1095"},{"id":"http://arxiv.org/abs/2503.04704v1","updated":"2025-03-06T18:54:32Z","published":"2025-03-06T18:54:32Z","title":"Universality of Layer-Level Entropy-Weighted Quantization Beyond Model\n Architecture and Size","summary":" We present a novel approach to selective model quantization that transcends\nthe limitations of architecture-specific and size-dependent compression methods\nfor Large Language Models (LLMs) using Entropy-Weighted Quantization (EWQ). By\nanalyzing the entropy distribution across transformer blocks, EWQ determines\nwhich blocks can be safely quantized without causing significant performance\ndegradation, independent of model architecture or size. Our method outperforms\nuniform quantization approaches, maintaining Massive Multitask Language\nUnderstanding (MMLU) accuracy scores within 0.5% of unquantized models while\nreducing memory usage by up to 18%. We demonstrate the effectiveness of EWQ\nacross multiple architectures-from 1.6B to 70B parameters-showcasing consistent\nimprovements in the quality-compression trade-off regardless of model scale or\narchitectural design. A surprising finding of EWQ is its ability to reduce\nperplexity compared to unquantized models, suggesting the presence of\nbeneficial regularization through selective precision reduction. This\nimprovement holds across different model families, indicating a fundamental\nrelationship between layer-level entropy and optimal precision requirements.\nAdditionally, we introduce FastEWQ, a rapid method for entropy distribution\nanalysis that eliminates the need for loading model weights. This technique\nleverages universal characteristics of entropy distribution that persist across\nvarious architectures and scales, enabling near-instantaneous quantization\ndecisions while maintaining 80% classification accuracy with full entropy\nanalysis. 
Our results demonstrate that effective quantization strategies can be\ndeveloped independently of specific architectural choices or model sizes,\nopening new possibilities for efficient LLM deployment.\n","authors":["Alireza Behtash","Marijan Fofonjka","Ethan Baird","Tyler Mauer","Hossein Moghimifam","David Stout","Joel Dennison"],"pdf_url":"https://arxiv.org/pdf/2503.04704v1.pdf","comment":"29 pages, 7 figures, 14 tables; Comments are welcome"},{"id":"http://arxiv.org/abs/2502.15037v4","updated":"2025-03-06T18:50:30Z","published":"2025-02-20T20:46:09Z","title":"DEFT: Differentiable Branched Discrete Elastic Rods for Modeling\n Furcated DLOs in Real-Time","summary":" Autonomous wire harness assembly requires robots to manipulate complex\nbranched cables with high precision and reliability. A key challenge in\nautomating this process is predicting how these flexible and branched\nstructures behave under manipulation. Without accurate predictions, it is\ndifficult for robots to reliably plan or execute assembly operations. While\nexisting research has made progress in modeling single-threaded Deformable\nLinear Objects (DLOs), extending these approaches to Branched Deformable Linear\nObjects (BDLOs) presents fundamental challenges. The junction points in BDLOs\ncreate complex force interactions and strain propagation patterns that cannot\nbe adequately captured by simply connecting multiple single-DLO models. To\naddress these challenges, this paper presents Differentiable discrete branched\nElastic rods for modeling Furcated DLOs in real-Time (DEFT), a novel framework\nthat combines a differentiable physics-based model with a learning framework\nto: 1) accurately model BDLO dynamics, including dynamic propagation at\njunction points and grasping in the middle of a BDLO, 2) achieve efficient\ncomputation for real-time inference, and 3) enable planning to demonstrate\ndexterous BDLO manipulation. 
A comprehensive series of real-world experiments\ndemonstrates DEFT's efficacy in terms of accuracy, computational speed, and\ngeneralizability compared to state-of-the-art alternatives. Project\npage:https://roahmlab.github.io/DEFT/.\n","authors":["Yizhou Chen","Xiaoyue Wu","Yeheng Zong","Anran Li","Yuzhen Chen","Julie Wu","Bohao Zhang","Ram Vasudevan"],"pdf_url":"https://arxiv.org/pdf/2502.15037v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02784v2","updated":"2025-03-06T18:45:51Z","published":"2025-03-04T16:57:53Z","title":"Do Not Trust Licenses You See -- Dataset Compliance Requires\n Massive-Scale AI-Powered Lifecycle Tracing","summary":" This paper argues that a dataset's legal risk cannot be accurately assessed\nby its license terms alone; instead, tracking dataset redistribution and its\nfull lifecycle is essential. However, this process is too complex for legal\nexperts to handle manually at scale. Tracking dataset provenance, verifying\nredistribution rights, and assessing evolving legal risks across multiple\nstages require a level of precision and efficiency that exceeds human\ncapabilities. Addressing this challenge effectively demands AI agents that can\nsystematically trace dataset redistribution, analyze compliance, and identify\nlegal risks. We develop an automated data compliance system called NEXUS and\nshow that AI can perform these tasks with higher accuracy, efficiency, and\ncost-effectiveness than human experts. Our massive legal analysis of 17,429\nunique entities and 8,072 license terms using this approach reveals the\ndiscrepancies in legal rights between the original datasets before\nredistribution and their redistributed subsets, underscoring the necessity of\nthe data lifecycle-aware compliance. For instance, we find that out of 2,852\ndatasets with commercially viable individual license terms, only 605 (21%) are\nlegally permissible for commercialization. 
This work sets a new standard for AI\ndata governance, advocating for a framework that systematically examines the\nentire lifecycle of dataset redistribution to ensure transparent, legal, and\nresponsible dataset management.\n","authors":["Jaekyeom Kim","Sungryull Sohn","Gerrard Jeongwon Jo","Jihoon Choi","Kyunghoon Bae","Hwayoung Lee","Yongmin Park","Honglak Lee"],"pdf_url":"https://arxiv.org/pdf/2503.02784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04697v1","updated":"2025-03-06T18:43:29Z","published":"2025-03-06T18:43:29Z","title":"L1: Controlling How Long A Reasoning Model Thinks With Reinforcement\n Learning","summary":" Reasoning language models have shown an uncanny ability to improve\nperformance at test-time by ``thinking longer''-that is, by generating longer\nchain-of-thought sequences and hence using more compute. However, the length of\ntheir chain-of-thought reasoning is not controllable, making it impossible to\nallocate test-time compute to achieve a desired level of performance. We\nintroduce Length Controlled Policy Optimization (LCPO), a simple reinforcement\nlearning method that optimizes for accuracy and adherence to user-specified\nlength constraints. We use LCPO to train L1, a reasoning language model that\nproduces outputs satisfying a length constraint given in its prompt. L1's\nlength control allows for smoothly trading off computational cost and accuracy\non a wide range of tasks, and outperforms the state-of-the-art S1 method for\nlength control. Furthermore, we uncover an unexpected short chain-of-thought\ncapability in models trained with LCPO. For instance, our 1.5B L1 model\nsurpasses GPT-4o at equal reasoning lengths. Overall, LCPO enables precise\ncontrol over reasoning length, allowing for fine-grained allocation of\ntest-time compute and accuracy. 
We release code and models at\nhttps://www.cmu-l3.github.io/l1\n","authors":["Pranjal Aggarwal","Sean Welleck"],"pdf_url":"https://arxiv.org/pdf/2503.04697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02694v3","updated":"2025-03-06T18:41:54Z","published":"2024-10-03T17:20:11Z","title":"HELMET: How to Evaluate Long-Context Language Models Effectively and\n Thoroughly","summary":" Many benchmarks exist for evaluating long-context language models (LCLMs),\nyet developers often rely on synthetic tasks such as needle-in-a-haystack\n(NIAH) or an arbitrary subset of tasks. However, it remains unclear whether\nthese benchmarks reflect the diverse downstream applications of LCLMs, and such\ninconsistencies further complicate model comparison. We investigate the\nunderlying reasons behind these practices and find that existing benchmarks\noften provide noisy signals due to limited coverage of applications,\ninsufficient context lengths, unreliable metrics, and incompatibility with base\nmodels. In this work, we introduce HELMET (How to Evaluate Long-context Models\nEffectively and Thoroughly), a comprehensive benchmark encompassing seven\ndiverse, application-centric categories. We also address several issues in\nprevious benchmarks by adding controllable lengths up to 128K tokens,\nmodel-based evaluation for reliable metrics, and few-shot prompting for\nrobustly evaluating base models. Consequently, we demonstrate that HELMET\noffers more reliable and consistent rankings of frontier LCLMs. Through a\ncomprehensive study of 59 LCLMs, we find that (1) synthetic tasks like NIAH do\nnot reliably predict downstream performance; (2) the diverse categories in\nHELMET exhibit distinct trends and low correlations with each other; and (3)\nwhile most LCLMs achieve perfect NIAH scores, open-source models significantly\nlag behind closed ones when tasks require full-context reasoning or following\ncomplex instructions -- the gap widens as length increases. 
Finally, we\nrecommend using our RAG tasks for fast model development, as they are easy to\nrun and better predict other downstream performance; ultimately, we advocate\nfor a holistic evaluation across diverse tasks.\n","authors":["Howard Yen","Tianyu Gao","Minmin Hou","Ke Ding","Daniel Fleischer","Peter Izsak","Moshe Wasserblat","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02694v3.pdf","comment":"ICLR 2025. Project page: https://princeton-nlp.github.io/HELMET/"},{"id":"http://arxiv.org/abs/2503.04680v1","updated":"2025-03-06T18:22:46Z","published":"2025-03-06T18:22:46Z","title":"Matrix Factorization for Inferring Associations and Missing Links","summary":" Missing link prediction is a method for network analysis, with applications\nin recommender systems, biology, social sciences, cybersecurity, information\nretrieval, and Artificial Intelligence (AI) reasoning in Knowledge Graphs.\nMissing link prediction identifies unseen but potentially existing connections\nin a network by analyzing the observed patterns and relationships. In\nproliferation detection, this supports efforts to identify and characterize\nattempts by state and non-state actors to acquire nuclear weapons or associated\ntechnology - a notoriously challenging but vital mission for global security.\nDimensionality reduction techniques like Non-Negative Matrix Factorization\n(NMF) and Logistic Matrix Factorization (LMF) are effective but require\nselection of the matrix rank parameter, that is, of the number of hidden\nfeatures, k, to avoid over/under-fitting. We introduce novel Weighted (WNMFk),\nBoolean (BNMFk), and Recommender (RNMFk) matrix factorization methods, along\nwith ensemble variants incorporating logistic factorization, for link\nprediction. 
Our methods integrate automatic model determination for rank\nestimation by evaluating stability and accuracy using a modified bootstrap\nmethodology and uncertainty quantification (UQ), assessing prediction\nreliability under random perturbations. We incorporate Otsu threshold selection\nand k-means clustering for Boolean matrix factorization, comparing them to\ncoordinate descent-based Boolean thresholding. Our experiments highlight the\nimpact of rank k selection, evaluate model performance under varying test-set\nsizes, and demonstrate the benefits of UQ for reliable predictions using\nabstention. We validate our methods on three synthetic datasets (Boolean and\nuniformly distributed) and benchmark them against LMF and symmetric LMF\n(symLMF) on five real-world protein-protein interaction networks, showcasing an\nimproved prediction performance.\n","authors":["Ryan Barron","Maksim E. Eren","Duc P. Truong","Cynthia Matuszek","James Wendelberger","Mary F. Dorn","Boian Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2503.04680v1.pdf","comment":"35 pages, 14 figures, 3 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2503.04679v1","updated":"2025-03-06T18:22:29Z","published":"2025-03-06T18:22:29Z","title":"Multi-Agent Inverse Q-Learning from Demonstrations","summary":" When reward functions are hand-designed, deep reinforcement learning\nalgorithms often suffer from reward misspecification, causing them to learn\nsuboptimal policies in terms of the intended task objectives. In the\nsingle-agent case, inverse reinforcement learning (IRL) techniques attempt to\naddress this issue by inferring the reward function from expert demonstrations.\nHowever, in multi-agent problems, misalignment between the learned and true\nobjectives is exacerbated due to increased environment non-stationarity and\nvariance that scales with multiple agents. 
As such, in multi-agent general-sum\ngames, multi-agent IRL algorithms have difficulty balancing cooperative and\ncompetitive objectives. To address these issues, we propose Multi-Agent\nMarginal Q-Learning from Demonstrations (MAMQL), a novel sample-efficient\nframework for multi-agent IRL. For each agent, MAMQL learns a critic\nmarginalized over the other agents' policies, allowing for a well-motivated use\nof Boltzmann policies in the multi-agent context. We identify a connection\nbetween optimal marginalized critics and single-agent soft-Q IRL, allowing us\nto apply a direct, simple optimization criterion from the single-agent domain.\nAcross our experiments on three different simulated domains, MAMQL\nsignificantly outperforms previous multi-agent methods in average reward,\nsample efficiency, and reward recovery by often more than 2-5x. We make our\ncode available at https://sites.google.com/view/mamql .\n","authors":["Nathaniel Haynam","Adam Khoja","Dhruv Kumar","Vivek Myers","Erdem Bıyık"],"pdf_url":"https://arxiv.org/pdf/2503.04679v1.pdf","comment":"8 pages, 4 figures, 2 tables. Published at the International\n Conference on Robotics and Automation (ICRA) 2025"},{"id":"http://arxiv.org/abs/2502.02067v2","updated":"2025-03-06T18:09:38Z","published":"2025-02-04T07:32:39Z","title":"AdaptBot: Combining LLM with Knowledge Graphs and Human Input for\n Generic-to-Specific Task Decomposition and Knowledge Refinement","summary":" An embodied agent assisting humans is often asked to complete new tasks, and\nthere may not be sufficient time or labeled examples to train the agent to\nperform these new tasks. Large Language Models (LLMs) trained on considerable\nknowledge across many domains can be used to predict a sequence of abstract\nactions for completing such tasks, although the agent may not be able to\nexecute this sequence due to task-, agent-, or domain-specific constraints. 
Our\nframework addresses these challenges by leveraging the generic predictions\nprovided by LLM and the prior domain knowledge encoded in a Knowledge Graph\n(KG), enabling an agent to quickly adapt to new tasks. The robot also solicits\nand uses human input as needed to refine its existing knowledge. Based on\nexperimental evaluation in the context of cooking and cleaning tasks in\nsimulation domains, we demonstrate that the interplay between LLM, KG, and\nhuman input leads to substantial performance gains compared with just using the\nLLM. Project website{\\S}: https://sssshivvvv.github.io/adaptbot/\n","authors":["Shivam Singh","Karthik Swaminathan","Nabanita Dash","Ramandeep Singh","Snehasis Banerjee","Mohan Sridharan","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2502.02067v2.pdf","comment":"Accepted to IEEE International Conference on Robotics and Automation\n (ICRA) 2025"},{"id":"http://arxiv.org/abs/2502.12360v2","updated":"2025-03-06T18:07:00Z","published":"2025-02-17T22:50:45Z","title":"Detecting Systematic Weaknesses in Vision Models along Predefined\n Human-Understandable Dimensions","summary":" Slice discovery methods (SDMs) are prominent algorithms for finding\nsystematic weaknesses in DNNs. They identify top-k semantically coherent\nslices/subsets of data where a DNN-under-test has low performance. For being\ndirectly useful, slices should be aligned with human-understandable and\nrelevant dimensions, which, for example, are defined by safety and domain\nexperts as part of the operational design domain (ODD). While SDMs can be\napplied effectively on structured data, their application on image data is\ncomplicated by the lack of semantic metadata. To address these issues, we\npresent an algorithm that combines foundation models for zero-shot image\nclassification to generate semantic metadata with methods for combinatorial\nsearch to find systematic weaknesses in images. 
In contrast to existing\napproaches, ours identifies weak slices that are in line with pre-defined\nhuman-understandable dimensions. As the algorithm includes foundation models,\nits intermediate and final results may not always be exact. Therefore, we\ninclude an approach to address the impact of noisy metadata. We validate our\nalgorithm on both synthetic and real-world datasets, demonstrating its ability\nto recover human-understandable systematic weaknesses. Furthermore, using our\napproach, we identify systematic weaknesses of multiple pre-trained and\npublicly available state-of-the-art computer vision DNNs.\n","authors":["Sujan Sai Gannamaneni","Rohil Prakash Rao","Michael Mock","Maram Akila","Stefan Wrobel"],"pdf_url":"https://arxiv.org/pdf/2502.12360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04873v2","updated":"2025-03-06T17:35:19Z","published":"2025-01-08T23:07:10Z","title":"Back Home: A Machine Learning Approach to Seashell Classification and\n Ecosystem Restoration","summary":" In Costa Rica, an average of 5 tons of seashells are extracted from\necosystems annually. Confiscated seashells, cannot be returned to their\necosystems due to the lack of origin recognition. To address this issue, we\ndeveloped a convolutional neural network (CNN) specifically for seashell\nidentification. We built a dataset from scratch, consisting of approximately\n19000 images from the Pacific and Caribbean coasts. Using this dataset, the\nmodel achieved a classification accuracy exceeding 85%. The model has been\nintegrated into a user-friendly application, which has classified over 36,000\nseashells to date, delivering real-time results within 3 seconds per image. 
To\nfurther enhance the system's accuracy, an anomaly detection mechanism was\nincorporated to filter out irrelevant or anomalous inputs, ensuring only valid\nseashell images are processed.\n","authors":["Alexander Valverde","Luis Solano"],"pdf_url":"https://arxiv.org/pdf/2501.04873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04647v1","updated":"2025-03-06T17:33:01Z","published":"2025-03-06T17:33:01Z","title":"Implicit Cross-Lingual Rewarding for Efficient Multilingual Preference\n Alignment","summary":" Direct Preference Optimization (DPO) has become a prominent method for\naligning Large Language Models (LLMs) with human preferences. While DPO has\nenabled significant progress in aligning English LLMs, multilingual preference\nalignment is hampered by data scarcity. To address this, we propose a novel\napproach that $\\textit{captures}$ learned preferences from well-aligned English\nmodels by implicit rewards and $\\textit{transfers}$ them to other languages\nthrough iterative training. Specifically, we derive an implicit reward model\nfrom the logits of an English DPO-aligned model and its corresponding reference\nmodel. This reward model is then leveraged to annotate preference relations in\ncross-lingual instruction-following pairs, using English instructions to\nevaluate multilingual responses. The annotated data is subsequently used for\nmultilingual DPO fine-tuning, facilitating preference knowledge transfer from\nEnglish to other languages. Fine-tuning Llama3 for two iterations resulted in a\n12.72% average improvement in Win Rate and a 5.97% increase in Length Control\nWin Rate across all training languages on the X-AlpacaEval leaderboard. Our\nfindings demonstrate that leveraging existing English-aligned models can enable\nefficient and effective multilingual preference alignment, significantly\nreducing the need for extensive multilingual preference data. 
The code is\navailable at https://github.com/ZNLP/Implicit-Cross-Lingual-Rewarding\n","authors":["Wen Yang","Junhong Wu","Chen Wang","Chengqing Zong","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04647v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2503.04641v1","updated":"2025-03-06T17:31:43Z","published":"2025-03-06T17:31:43Z","title":"Simulating the Real World: A Unified Survey of Multimodal Generative\n Models","summary":" Understanding and replicating the real world is a critical challenge in\nArtificial General Intelligence (AGI) research. To achieve this, many existing\napproaches, such as world models, aim to capture the fundamental principles\ngoverning the physical world, enabling more accurate simulations and meaningful\ninteractions. However, current methods often treat different modalities,\nincluding 2D (images), videos, 3D, and 4D representations, as independent\ndomains, overlooking their interdependencies. Additionally, these methods\ntypically focus on isolated dimensions of reality without systematically\nintegrating their connections. In this survey, we present a unified survey for\nmultimodal generative models that investigate the progression of data\ndimensionality in real-world simulation. Specifically, this survey starts from\n2D generation (appearance), then moves to video (appearance+dynamics) and 3D\ngeneration (appearance+geometry), and finally culminates in 4D generation that\nintegrate all dimensions. To the best of our knowledge, this is the first\nattempt to systematically unify the study of 2D, video, 3D and 4D generation\nwithin a single framework. To guide future research, we provide a comprehensive\nreview of datasets, evaluation metrics and future directions, and fostering\ninsights for newcomers. 
This survey serves as a bridge to advance the study of\nmultimodal generative models and real-world simulation within a unified\nframework.\n","authors":["Yuqi Hu","Longguang Wang","Xian Liu","Ling-Hao Chen","Yuwei Guo","Yukai Shi","Ce Liu","Anyi Rao","Zeyu Wang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.04641v1.pdf","comment":"Repository for the related papers at\n https://github.com/ALEEEHU/World-Simulator"},{"id":"http://arxiv.org/abs/2202.00665v4","updated":"2025-03-06T17:24:46Z","published":"2022-02-01T18:58:33Z","title":"Tutorial on amortized optimization","summary":" Optimization is a ubiquitous modeling tool and is often deployed in settings\nwhich repeatedly solve similar instances of the same problem. Amortized\noptimization methods use learning to predict the solutions to problems in these\nsettings, exploiting the shared structure between similar problem instances.\nThese methods have been crucial in variational inference and reinforcement\nlearning and are capable of solving optimization problems many orders of\nmagnitudes times faster than traditional optimization methods that do not use\namortization. This tutorial presents an introduction to the amortized\noptimization foundations behind these advancements and overviews their\napplications in variational inference, sparse coding, gradient-based\nmeta-learning, control, reinforcement learning, convex optimization, optimal\ntransport, and deep equilibrium networks. 
The source code for this tutorial is\navailable at\nhttps://github.com/facebookresearch/amortized-optimization-tutorial.\n","authors":["Brandon Amos"],"pdf_url":"https://arxiv.org/pdf/2202.00665v4.pdf","comment":"Foundations and Trends in Machine Learning"},{"id":"http://arxiv.org/abs/2503.04636v1","updated":"2025-03-06T17:24:06Z","published":"2025-03-06T17:24:06Z","title":"Mark Your LLM: Detecting the Misuse of Open-Source Large Language Models\n via Watermarking","summary":" As open-source large language models (LLMs) like Llama3 become more capable,\nit is crucial to develop watermarking techniques to detect their potential\nmisuse. Existing watermarking methods either add watermarks during LLM\ninference, which is unsuitable for open-source LLMs, or primarily target\nclassification LLMs rather than recent generative LLMs. Adapting these\nwatermarks to open-source LLMs for misuse detection remains an open challenge.\nThis work defines two misuse scenarios for open-source LLMs: intellectual\nproperty (IP) violation and LLM Usage Violation. Then, we explore the\napplication of inference-time watermark distillation and backdoor watermarking\nin these contexts. We propose comprehensive evaluation methods to assess the\nimpact of various real-world further fine-tuning scenarios on watermarks and\nthe effect of these watermarks on LLM performance. Our experiments reveal that\nbackdoor watermarking could effectively detect IP Violation, while\ninference-time watermark distillation is applicable in both scenarios but less\nrobust to further fine-tuning and has a more significant impact on LLM\nperformance compared to backdoor watermarking. 
Exploring more advanced\nwatermarking methods for open-source LLMs to detect their misuse should be an\nimportant future direction.\n","authors":["Yijie Xu","Aiwei Liu","Xuming Hu","Lijie Wen","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.04636v1.pdf","comment":"Accepted by the 1st Workshop on GenAI Watermarking, collocated with\n ICLR 2025"},{"id":"http://arxiv.org/abs/2503.00897v3","updated":"2025-03-06T17:19:22Z","published":"2025-03-02T13:43:53Z","title":"A Simple and Effective Reinforcement Learning Method for Text-to-Image\n Diffusion Fine-tuning","summary":" Reinforcement learning (RL)-based fine-tuning has emerged as a powerful\napproach for aligning diffusion models with black-box objectives. Proximal\npolicy optimization (PPO) is the most popular choice of method for policy\noptimization. While effective in terms of performance, PPO is highly sensitive\nto hyper-parameters and involves substantial computational overhead. REINFORCE,\non the other hand, mitigates some computational complexities such as high\nmemory overhead and sensitive hyper-parameter tuning, but has suboptimal\nperformance due to high-variance and sample inefficiency. While the variance of\nthe REINFORCE can be reduced by sampling multiple actions per input prompt and\nusing a baseline correction term, it still suffers from sample inefficiency. To\naddress these challenges, we systematically analyze the\nefficiency-effectiveness trade-off between REINFORCE and PPO, and propose\nleave-one-out PPO (LOOP), a novel RL for diffusion fine-tuning method. 
LOOP\ncombines variance reduction techniques from REINFORCE, such as sampling\nmultiple actions per input prompt and a baseline correction term, with the\nrobustness and sample efficiency of PPO via clipping and importance sampling.\nOur results demonstrate that LOOP effectively improves diffusion models on\nvarious black-box objectives, and achieves a better balance between\ncomputational efficiency and performance.\n","authors":["Shashank Gupta","Chaitanya Ahuja","Tsung-Yu Lin","Sreya Dutta Roy","Harrie Oosterhuis","Maarten de Rijke","Satya Narayan Shukla"],"pdf_url":"https://arxiv.org/pdf/2503.00897v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04626v1","updated":"2025-03-06T17:12:46Z","published":"2025-03-06T17:12:46Z","title":"IDInit: A Universal and Stable Initialization Method for Neural Network\n Training","summary":" Deep neural networks have achieved remarkable accomplishments in practice.\nThe success of these networks hinges on effective initialization methods, which\nare vital for ensuring stable and rapid convergence during training. Recently,\ninitialization methods that maintain identity transition within layers have\nshown good efficiency in network training. These techniques (e.g., Fixup) set\nspecific weights to zero to achieve identity control. However, settings of\nremaining weight (e.g., Fixup uses random values to initialize non-zero\nweights) will affect the inductive bias that is achieved only by a zero weight,\nwhich may be harmful to training. Addressing this concern, we introduce fully\nidentical initialization (IDInit), a novel method that preserves identity in\nboth the main and sub-stem layers of residual networks. IDInit employs a padded\nidentity-like matrix to overcome rank constraints in non-square weight\nmatrices. Furthermore, we show the convergence problem of an identity matrix\ncan be solved by stochastic gradient descent. 
Additionally, we enhance the\nuniversality of IDInit by processing higher-order weights and addressing dead\nneuron problems. IDInit is a straightforward yet effective initialization\nmethod, with improved convergence, stability, and performance across various\nsettings, including large-scale datasets and deep models.\n","authors":["Yu Pan","Chaozheng Wang","Zekai Wu","Qifan Wang","Min Zhang","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2503.04626v1.pdf","comment":"Accepted in ICLR 2025"},{"id":"http://arxiv.org/abs/2410.05116v2","updated":"2025-03-06T17:11:55Z","published":"2024-10-07T15:12:01Z","title":"Human-Feedback Efficient Reinforcement Learning for Online Diffusion\n Model Finetuning","summary":" Controllable generation through Stable Diffusion (SD) fine-tuning aims to\nimprove fidelity, safety, and alignment with human guidance. Existing\nreinforcement learning from human feedback methods usually rely on predefined\nheuristic reward functions or pretrained reward models built on large-scale\ndatasets, limiting their applicability to scenarios where collecting such data\nis costly or difficult. To effectively and efficiently utilize human feedback,\nwe develop a framework, HERO, which leverages online human feedback collected\non the fly during model learning. Specifically, HERO features two key\nmechanisms: (1) Feedback-Aligned Representation Learning, an online training\nmethod that captures human feedback and provides informative learning signals\nfor fine-tuning, and (2) Feedback-Guided Image Generation, which involves\ngenerating images from SD's refined initialization samples, enabling faster\nconvergence towards the evaluator's intent. We demonstrate that HERO is 4x more\nefficient in online feedback for body part anomaly correction compared to the\nbest existing method. 
Additionally, experiments show that HERO can effectively\nhandle tasks like reasoning, counting, personalization, and reducing NSFW\ncontent with only 0.5K online feedback.\n","authors":["Ayano Hiranaka","Shang-Fu Chen","Chieh-Hsin Lai","Dongjun Kim","Naoki Murata","Takashi Shibuya","Wei-Hsiang Liao","Shao-Hua Sun","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2410.05116v2.pdf","comment":"Published in International Conference on Learning Representations\n (ICLR) 2025"},{"id":"http://arxiv.org/abs/2407.18125v3","updated":"2025-03-06T17:03:35Z","published":"2024-07-25T15:32:59Z","title":"Self-supervised pre-training with diffusion model for few-shot landmark\n detection in x-ray images","summary":" Deep neural networks have been extensively applied in the medical domain for\nvarious tasks, including image classification, segmentation, and landmark\ndetection. However, their application is often hindered by data scarcity, both\nin terms of available annotations and images. This study introduces a novel\napplication of denoising diffusion probabilistic models (DDPMs) to the landmark\ndetection task, specifically addressing the challenge of limited annotated data\nin x-ray imaging. Our key innovation lies in leveraging DDPMs for\nself-supervised pre-training in landmark detection, a previously unexplored\napproach in this domain. This method enables accurate landmark detection with\nminimal annotated training data (as few as 50 images), surpassing both ImageNet\nsupervised pre-training and traditional self-supervised techniques across three\npopular x-ray benchmark datasets. 
To our knowledge, this work represents the\nfirst application of diffusion models for self-supervised learning in landmark\ndetection, which may offer a valuable pre-training approach in few-shot\nregimes, for mitigating data scarcity.\n","authors":["Roberto Di Via","Francesca Odone","Vito Paolo Pastore"],"pdf_url":"https://arxiv.org/pdf/2407.18125v3.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2503.04606v1","updated":"2025-03-06T16:53:14Z","published":"2025-03-06T16:53:14Z","title":"The Best of Both Worlds: Integrating Language Models and Diffusion\n Models for Video Generation","summary":" Recent advancements in text-to-video (T2V) generation have been driven by two\ncompeting paradigms: autoregressive language models and diffusion models.\nHowever, each paradigm has intrinsic limitations: language models struggle with\nvisual quality and error accumulation, while diffusion models lack semantic\nunderstanding and causal modeling. In this work, we propose LanDiff, a hybrid\nframework that synergizes the strengths of both paradigms through\ncoarse-to-fine generation. Our architecture introduces three key innovations:\n(1) a semantic tokenizer that compresses 3D visual features into compact 1D\ndiscrete representations through efficient semantic compression, achieving a\n$\\sim$14,000$\\times$ compression ratio; (2) a language model that generates\nsemantic tokens with high-level semantic relationships; (3) a streaming\ndiffusion model that refines coarse semantics into high-fidelity videos.\nExperiments show that LanDiff, a 5B model, achieves a score of 85.43 on the\nVBench T2V benchmark, surpassing the state-of-the-art open-source models\nHunyuan Video (13B) and other commercial models such as Sora, Keling, and\nHailuo. Furthermore, our model also achieves state-of-the-art performance in\nlong video generation, surpassing other open-source models in this field. 
Our\ndemo can be viewed at https://landiff.github.io/.\n","authors":["Aoxiong Yin","Kai Shen","Yichong Leng","Xu Tan","Xinyu Zhou","Juncheng Li","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2503.04606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04598v1","updated":"2025-03-06T16:40:48Z","published":"2025-03-06T16:40:48Z","title":"HybridNorm: Towards Stable and Efficient Transformer Training via Hybrid\n Normalization","summary":" Transformers have become the de facto architecture for a wide range of\nmachine learning tasks, particularly in large language models (LLMs). Despite\ntheir remarkable performance, challenges remain in training deep transformer\nnetworks, especially regarding the location of layer normalization. While\nPre-Norm structures facilitate easier training due to their more prominent\nidentity path, they often yield suboptimal performance compared to Post-Norm.\nIn this paper, we propose $\\textbf{HybridNorm}$, a straightforward yet\neffective hybrid normalization strategy that integrates the advantages of both\nPre-Norm and Post-Norm approaches. Specifically, HybridNorm employs QKV\nnormalization within the attention mechanism and Post-Norm in the feed-forward\nnetwork (FFN) of each transformer block. This design not only stabilizes\ntraining but also enhances performance, particularly in the context of LLMs.\nComprehensive experiments in both dense and sparse architectures show that\nHybridNorm consistently outperforms both Pre-Norm and Post-Norm approaches,\nachieving state-of-the-art results across various benchmarks. These findings\nhighlight the potential of HybridNorm as a more stable and effective technique\nfor improving the training and performance of deep transformer models. %Code\nwill be made publicly available. 
Code is available at\nhttps://github.com/BryceZhuo/HybridNorm.\n","authors":["Zhijian Zhuo","Yutao Zeng","Ya Wang","Sijun Zhang","Jian Yang","Xiaoqing Li","Xun Zhou","Jinwen Ma"],"pdf_url":"https://arxiv.org/pdf/2503.04598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04596v1","updated":"2025-03-06T16:38:23Z","published":"2025-03-06T16:38:23Z","title":"The Next Frontier of LLM Applications: Open Ecosystems and Hardware\n Synergy","summary":" Large Language Model (LLM) applications, including LLM app stores and\nautonomous agents, are shaping the future of AI ecosystems. However, platform\nsilos, fragmented hardware integration, and the absence of standardized\ninterfaces limit scalability, interoperability, and resource efficiency. While\nLLM app stores democratize AI, their closed ecosystems restrict modular AI\nreuse and cross-platform portability. Meanwhile, agent-based frameworks offer\nflexibility but often lack seamless integration across diverse environments.\nThis paper envisions the future of LLM applications and proposes a three-layer\ndecoupled architecture grounded in software engineering principles such as\nlayered system design, service-oriented architectures, and hardware-software\nco-design. This architecture separates application logic, communication\nprotocols, and hardware execution, enhancing modularity, efficiency, and\ncross-platform compatibility. Beyond architecture, we highlight key security\nand privacy challenges for safe, scalable AI deployment and outline research\ndirections in software and security engineering. 
This vision aims to foster\nopen, secure, and interoperable LLM ecosystems, guiding future advancements in\nAI applications.\n","authors":["Xinyi Hou","Yanjie Zhao","Haoyu Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00053v3","updated":"2025-03-06T16:28:55Z","published":"2024-10-30T19:09:02Z","title":"ACC-Collab: An Actor-Critic Approach to Multi-Agent LLM Collaboration","summary":" Large language models (LLMs) have demonstrated a remarkable ability to serve\nas general-purpose tools for various language-based tasks. Recent works have\ndemonstrated that the efficacy of such models can be improved through iterative\ndialog between multiple models. While these paradigms show promise in improving\nmodel efficacy, most works in this area treat collaboration as an emergent\nbehavior, rather than a learned behavior. In doing so, current multi-agent\nframeworks rely on collaborative behaviors to have been sufficiently trained\ninto off-the-shelf models. To address this limitation, we propose ACC-Collab,\nan Actor-Critic based learning framework to produce a two-agent team (an\nactor-agent and a critic-agent) specialized in collaboration. 
We demonstrate\nthat ACC-Collab outperforms SotA multi-agent techniques on a wide array of\nbenchmarks.\n","authors":["Andrew Estornell","Jean-Francois Ton","Yuanshun Yao","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.00053v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17412v3","updated":"2025-03-06T16:22:22Z","published":"2024-05-27T17:57:12Z","title":"Towards One Model for Classical Dimensionality Reduction: A\n Probabilistic Perspective on UMAP and t-SNE","summary":" This paper shows that dimensionality reduction methods such as UMAP and\nt-SNE, can be approximately recast as MAP inference methods corresponding to a\nmodel introduced in ProbDR, that describes the graph Laplacian (an estimate of\nthe data precision matrix) using a Wishart distribution, with a mean given by a\nnon-linear covariance function evaluated on the latents. This interpretation\noffers deeper theoretical and semantic insights into such algorithms, by\nshowing that variances corresponding to these covariances are low (potentially\nmisspecified), and forging a connection to Gaussian process latent variable\nmodels by showing that well-known kernels can be used to describe covariances\nimplied by graph Laplacians. We also introduce tools with which similar\ndimensionality reduction methods can be studied.\n","authors":["Aditya Ravuri","Neil D. Lawrence"],"pdf_url":"https://arxiv.org/pdf/2405.17412v3.pdf","comment":"Updated preprint"},{"id":"http://arxiv.org/abs/2503.02972v2","updated":"2025-03-06T16:16:07Z","published":"2025-03-04T19:57:47Z","title":"LINGOLY-TOO: Disentangling Memorisation from Reasoning with Linguistic\n Templatisation and Orthographic Obfuscation","summary":" Assessing the reasoning capabilities of large language models (LLMs) is\nsusceptible to overestimation due to data exposure of evaluation benchmarks. 
We\nintroduce a framework for producing linguistic reasoning problems that reduces\nthe effect of memorisation in model performance estimates and apply this\nframework to develop LINGOLY-TOO, a challenging benchmark for linguistic\nreasoning. By developing orthographic templates, we dynamically obfuscate the\nwriting systems of real languages to generate numerousquestion variations.\nThese variations preserve the reasoning steps required for each solution while\nreducing the likelihood of specific problem instances appearing in model\ntraining data. Our experiments demonstrate that frontier models, including\nClaud 3.7 Sonnet, o1-preview and DeepSeek R1, struggle with advanced reasoning.\nOur analysis also shows that LLMs exhibit noticeable variance in accuracy\nacross permutations of the same problem, and on average perform better on\nquestions appearing in their original orthography. Our findings highlight the\nopaque nature of response generation in LLMs and provide evidence that prior\ndata exposure contributes to over estimating the reasoning capabilities of\nfrontier models.\n","authors":["Jude Khouja","Karolina Korgul","Simi Hellsten","Lingyi Yang","Vlad Neacs","Harry Mayne","Ryan Kearns","Andrew Bean","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2503.02972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.17504v2","updated":"2025-03-06T16:14:45Z","published":"2025-02-21T19:22:10Z","title":"Protein Large Language Models: A Comprehensive Survey","summary":" Protein-specific large language models (Protein LLMs) are revolutionizing\nprotein science by enabling more efficient protein structure prediction,\nfunction annotation, and design. While existing surveys focus on specific\naspects or applications, this work provides the first comprehensive overview of\nProtein LLMs, covering their architectures, training datasets, evaluation\nmetrics, and diverse applications. 
Through a systematic analysis of over 100\narticles, we propose a structured taxonomy of state-of-the-art Protein LLMs,\nanalyze how they leverage large-scale protein sequence data for improved\naccuracy, and explore their potential in advancing protein engineering and\nbiomedical research. Additionally, we discuss key challenges and future\ndirections, positioning Protein LLMs as essential tools for scientific\ndiscovery in protein science. Resources are maintained at\nhttps://github.com/Yijia-Xiao/Protein-LLM-Survey.\n","authors":["Yijia Xiao","Wanjia Zhao","Junkai Zhang","Yiqiao Jin","Han Zhang","Zhicheng Ren","Renliang Sun","Haixin Wang","Guancheng Wan","Pan Lu","Xiao Luo","Yu Zhang","James Zou","Yizhou Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2502.17504v2.pdf","comment":"24 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2503.01804v2","updated":"2025-03-06T16:07:43Z","published":"2025-03-03T18:33:46Z","title":"$\\texttt{SEM-CTRL}$: Semantically Controlled Decoding","summary":" Ensuring both syntactic and semantic correctness in Large Language Model\n(LLM) outputs remains a significant challenge, despite being critical for\nreal-world deployment. In this paper, we introduce $\\texttt{SEM-CTRL}$, a\nunified approach that enforces rich context-sensitive constraints and task- and\ninstance-specific semantics directly on an LLM decoder. Our approach integrates\ntoken-level MCTS, which is guided by specific syntactic and semantic\nconstraints. The constraints over the desired outputs are expressed using\nAnswer Set Grammars -- a logic-based formalism that generalizes\ncontext-sensitive grammars while incorporating background knowledge to\nrepresent task-specific semantics. We show that our approach guarantees correct\ncompletions for any off-the-shelf LLM without the need for fine-tuning. We\nevaluate $\\texttt{SEM-CTRL}$ on a range of tasks, including synthetic grammar\nsynthesis, combinatorial reasoning, and planning. 
Our results demonstrate that\n$\\texttt{SEM-CTRL}$ allows small pre-trained LLMs to efficiently outperform\nlarger variants and state-of-the-art reasoning models (e.g., o1-preview) while\nsimultaneously guaranteeing solution correctness.\n","authors":["Mohammad Albinhassan","Pranava Madhyastha","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2503.01804v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04569v1","updated":"2025-03-06T16:02:53Z","published":"2025-03-06T16:02:53Z","title":"ValuePilot: A Two-Phase Framework for Value-Driven Decision-Making","summary":" Despite recent advances in artificial intelligence (AI), it poses challenges\nto ensure personalized decision-making in tasks that are not considered in\ntraining datasets. To address this issue, we propose ValuePilot, a two-phase\nvalue-driven decision-making framework comprising a dataset generation toolkit\nDGT and a decision-making module DMM trained on the generated data. DGT is\ncapable of generating scenarios based on value dimensions and closely mirroring\nreal-world tasks, with automated filtering techniques and human curation to\nensure the validity of the dataset. In the generated dataset, DMM learns to\nrecognize the inherent values of scenarios, computes action feasibility and\nnavigates the trade-offs between multiple value dimensions to make personalized\ndecisions. Extensive experiments demonstrate that, given human value\npreferences, our DMM most closely aligns with human decisions, outperforming\nClaude-3.5-Sonnet, Gemini-2-flash, Llama-3.1-405b and GPT-4o. This research is\na preliminary exploration of value-driven decision-making. 
We hope it will\nstimulate interest in value-driven decision-making and personalized\ndecision-making within the community.\n","authors":["Yitong Luo","Hou Hei Lam","Ziang Chen","Zhenliang Zhang","Xue Feng"],"pdf_url":"https://arxiv.org/pdf/2503.04569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04564v1","updated":"2025-03-06T15:53:37Z","published":"2025-03-06T15:53:37Z","title":"Fundamental Limits of Hierarchical Secure Aggregation with Cyclic User\n Association","summary":" Secure aggregation is motivated by federated learning (FL) where a cloud\nserver aims to compute an averaged model (i.e., weights of deep neural\nnetworks) of the locally-trained models of numerous clients, while adhering to\ndata security requirements. Hierarchical secure aggregation (HSA) extends this\nconcept to a three-layer network, where clustered users communicate with the\nserver through an intermediate layer of relays. In HSA, beyond conventional\nserver security, relay security is also enforced to ensure that the relays\nremain oblivious to the users' inputs (an abstraction of the local models in\nFL). Existing study on HSA assumes that each user is associated with only one\nrelay, limiting opportunities for coding across inter-cluster users to achieve\nefficient communication and key generation. In this paper, we consider HSA with\na cyclic association pattern where each user is connected to $B$ consecutive\nrelays in a wrap-around manner. We propose an efficient aggregation scheme\nwhich includes a message design for the inputs inspired by gradient coding-a\nwell-known technique for efficient communication in distributed computing-along\nwith a highly nontrivial security key design. 
We also derive novel converse\nbounds on the minimum achievable communication and key rates using\ninformation-theoretic arguments.\n","authors":["Xiang Zhang","Zhou Li","Kai Wan","Hua Sun","Mingyue Ji","Giuseppe Caire"],"pdf_url":"https://arxiv.org/pdf/2503.04564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00153v2","updated":"2025-03-06T15:50:28Z","published":"2024-09-30T18:52:53Z","title":"Beyond Single Concept Vector: Modeling Concept Subspace in LLMs with\n Gaussian Distribution","summary":" Probing learned concepts in large language models (LLMs) is crucial for\nunderstanding how semantic knowledge is encoded internally. Training linear\nclassifiers on probing tasks is a principle approach to denote the vector of a\ncertain concept in the representation space. However, the single vector\nidentified for a concept varies with both data and training, making it less\nrobust and weakening its effectiveness in real-world applications. To address\nthis challenge, we propose an approach to approximate the subspace representing\na specific concept. Built on linear probing classifiers, we extend the concept\nvectors into Gaussian Concept Subspace (GCS). We demonstrate GCS's\neffectiveness through measuring its faithfulness and plausibility across\nmultiple LLMs with different sizes and architectures. Additionally, we use\nrepresentation intervention tasks to showcase its efficacy in real-world\napplications such as emotion steering. 
Experimental results indicate that GCS\nconcept vectors have the potential to balance steering performance and\nmaintaining the fluency in natural language generation tasks.\n","authors":["Haiyan Zhao","Heng Zhao","Bo Shen","Ali Payani","Fan Yang","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2410.00153v2.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04556v1","updated":"2025-03-06T15:47:19Z","published":"2025-03-06T15:47:19Z","title":"Compositional Causal Reasoning Evaluation in Language Models","summary":" Causal reasoning and compositional reasoning are two core aspirations in\ngenerative AI. Measuring the extent of these behaviors requires principled\nevaluation methods. We explore a unified perspective that considers both\nbehaviors simultaneously, termed compositional causal reasoning (CCR): the\nability to infer how causal measures compose and, equivalently, how causal\nquantities propagate through graphs. We instantiate a framework for the\nsystematic evaluation of CCR for the average treatment effect and the\nprobability of necessity and sufficiency. As proof of concept, we demonstrate\nthe design of CCR tasks for language models in the LLama, Phi, and GPT\nfamilies. On a math word problem, our framework revealed a range of\ntaxonomically distinct error patterns. Additionally, CCR errors increased with\nthe complexity of causal paths for all models except o1.\n","authors":["Jacqueline R. M. A. Maasch","Alihan Hüyük","Xinnuo Xu","Aditya V. Nori","Javier Gonzalez"],"pdf_url":"https://arxiv.org/pdf/2503.04556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18691v2","updated":"2025-03-06T15:47:01Z","published":"2024-07-26T12:16:53Z","title":"Graph Neural Networks for Virtual Sensing in Complex Systems: Addressing\n Heterogeneous Temporal Dynamics","summary":" Real-time condition monitoring is crucial for the reliable and efficient\noperation of complex systems. 
However, relying solely on physical sensors can\nbe limited due to their cost, placement constraints, or inability to directly\nmeasure certain critical parameters. Virtual sensing addresses these\nlimitations by leveraging readily available sensor data and system knowledge to\nestimate inaccessible parameters or infer system states. The increasing\ncomplexity of industrial systems necessitates deployments of sensors with\ndiverse modalities to provide a comprehensive understanding of system states.\nThese sensors capture data at varying frequencies to monitor both rapid and\nslowly varying system dynamics, as well as local and global state evolutions of\nthe systems. This leads to heterogeneous temporal dynamics, which, particularly\nunder varying operational end environmental conditions, pose a significant\nchallenge for accurate virtual sensing. To address this, we propose a\nHeterogeneous Temporal Graph Neural Network (HTGNN) framework. HTGNN explicitly\nmodels signals from diverse sensors and integrates operating conditions into\nthe model architecture. We evaluate HTGNN using two newly released datasets: a\nbearing dataset with diverse load conditions for bearing load prediction and a\nyear-long simulated dataset for predicting bridge live loads. Our results\ndemonstrate that HTGNN significantly outperforms established baseline methods\nin both tasks, particularly under highly varying operating conditions. These\nresults highlight HTGNN's potential as a robust and accurate virtual sensing\napproach for complex systems, paving the way for improved monitoring,\npredictive maintenance, and enhanced system performance. 
Our code and data are\navailable under https://github.com/EPFL-IMOS/htgnn.\n","authors":["Mengjie Zhao","Cees Taal","Stephan Baggerohr","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2407.18691v2.pdf","comment":"This paper extends our previous conference paper (Best Paper at\n European Conference of the PHM Society 2024,\n https://doi.org/10.36001/phme.2024.v8i1.3998). Accepted by Mechanical Systems\n and Signal Processing (MSSP)"},{"id":"http://arxiv.org/abs/2502.09990v2","updated":"2025-03-06T15:38:31Z","published":"2025-02-14T08:22:51Z","title":"X-Boundary: Establishing Exact Safety Boundary to Shield LLMs from\n Multi-Turn Jailbreaks without Compromising Usability","summary":" Despite the rapid development of safety alignment techniques for LLMs,\ndefending against multi-turn jailbreaks is still a challenging task. In this\npaper, we conduct a comprehensive comparison, revealing that some existing\ndefense methods can improve the robustness of LLMs against multi-turn\njailbreaks but compromise usability, i.e., reducing general capabilities or\ncausing the over-refusal problem. From the perspective of mechanism\ninterpretability of LLMs, we discover that these methods fail to establish a\nboundary that exactly distinguishes safe and harmful feature representations.\nTherefore, boundary-safe representations close to harmful representations are\ninevitably disrupted, leading to a decline in usability. To address this issue,\nwe propose X-Boundary to push harmful representations away from boundary-safe\nrepresentations and obtain an exact distinction boundary. In this way, harmful\nrepresentations can be precisely erased without disrupting safe ones.\nExperimental results show that X-Boundary achieves state-of-the-art defense\nperformance against multi-turn jailbreaks, while reducing the over-refusal rate\nby about 20% and maintaining nearly complete general capability. 
Furthermore,\nwe theoretically prove and empirically verify that X-Boundary can accelerate\nthe convergence process during training. Please see our code at:\nhttps://github.com/AI45Lab/X-Boundary.\n","authors":["Xiaoya Lu","Dongrui Liu","Yi Yu","Luxin Xu","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2502.09990v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20984v2","updated":"2025-03-06T15:36:48Z","published":"2025-02-28T11:52:02Z","title":"UoR-NCL at SemEval-2025 Task 1: Using Generative LLMs and CLIP Models\n for Multilingual Multimodal Idiomaticity Representation","summary":" SemEval-2025 Task 1 focuses on ranking images based on their alignment with a\ngiven nominal compound that may carry idiomatic meaning in both English and\nBrazilian Portuguese. To address this challenge, this work uses generative\nlarge language models (LLMs) and multilingual CLIP models to enhance idiomatic\ncompound representations. LLMs generate idiomatic meanings for potentially\nidiomatic compounds, enriching their semantic interpretation. These meanings\nare then encoded using multilingual CLIP models, serving as representations for\nimage ranking. Contrastive learning and data augmentation techniques are\napplied to fine-tune these embeddings for improved performance. Experimental\nresults show that multimodal representations extracted through this method\noutperformed those based solely on the original nominal compounds. The\nfine-tuning approach shows promising outcomes but is less effective than using\nembeddings without fine-tuning. 
The source code used in this paper is available\nat https://github.com/tongwu17/SemEval-2025-Task1-UoR-NCL.\n","authors":["Thanet Markchom","Tong Wu","Liting Huang","Huizhi Liang"],"pdf_url":"https://arxiv.org/pdf/2502.20984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04550v1","updated":"2025-03-06T15:36:06Z","published":"2025-03-06T15:36:06Z","title":"Benchmarking Reasoning Robustness in Large Language Models","summary":" Despite the recent success of large language models (LLMs) in reasoning such\nas DeepSeek, we for the first time identify a key dilemma in reasoning\nrobustness and generalization: significant performance degradation on novel or\nincomplete data, suggesting a reliance on memorized patterns rather than\nsystematic reasoning. Our closer examination reveals four key unique\nlimitations underlying this issue:(1) Positional bias--models favor earlier\nqueries in multi-query inputs but answering the wrong one in the latter (e.g.,\nGPT-4o's accuracy drops from 75.8 percent to 72.8 percent); (2) Instruction\nsensitivity--performance declines by 5.0 to 7.5 percent in the Qwen2.5 Series\nand by 5.0 percent in DeepSeek-V3 with auxiliary guidance; (3) Numerical\nfragility--value substitution sharply reduces accuracy (e.g., GPT-4o drops from\n97.5 percent to 82.5 percent, GPT-o1-mini drops from 97.5 percent to 92.5\npercent); and (4) Memory dependence--models resort to guesswork when missing\ncritical data. These findings further highlight the reliance on heuristic\nrecall over rigorous logical inference, demonstrating challenges in reasoning\nrobustness. To comprehensively investigate these robustness challenges, this\npaper introduces a novel benchmark, termed as Math-RoB, that exploits\nhallucinations triggered by missing information to expose reasoning gaps. 
This\nis achieved by an instruction-based approach to generate diverse datasets that\nclosely resemble training distributions, facilitating a holistic robustness\nassessment and advancing the development of more robust reasoning frameworks.\nBad character(s) in field Abstract.\n","authors":["Tong Yu","Yongcheng Jing","Xikun Zhang","Wentao Jiang","Wenjie Wu","Yingjie Wang","Wenbin Hu","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2503.04550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00025v2","updated":"2025-03-06T15:29:41Z","published":"2024-02-28T15:19:33Z","title":"On the Challenges and Opportunities in Generative AI","summary":" The field of deep generative modeling has grown rapidly in the last few\nyears. With the availability of massive amounts of training data coupled with\nadvances in scalable unsupervised learning paradigms, recent large-scale\ngenerative models show tremendous promise in synthesizing high-resolution\nimages and text, as well as structured data such as videos and molecules.\nHowever, we argue that current large-scale generative AI models exhibit several\nfundamental shortcomings that hinder their widespread adoption across domains.\nIn this work, our objective is to identify these issues and highlight key\nunresolved challenges in modern generative AI paradigms that should be\naddressed to further enhance their capabilities, versatility, and reliability.\nBy identifying these challenges, we aim to provide researchers with insights\nfor exploring fruitful research directions, thus fostering the development of\nmore robust and accessible generative AI solutions.\n","authors":["Laura Manduchi","Kushagra Pandey","Clara Meister","Robert Bamler","Ryan Cotterell","Sina Däubener","Sophie Fellenz","Asja Fischer","Thomas Gärtner","Matthias Kirchler","Marius Kloft","Yingzhen Li","Christoph Lippert","Gerard de Melo","Eric Nalisnick","Björn Ommer","Rajesh Ranganath","Maja Rudolph","Karen Ullrich","Guy Van den Broeck","Julia E 
Vogt","Yixin Wang","Florian Wenzel","Frank Wood","Stephan Mandt","Vincent Fortuin"],"pdf_url":"https://arxiv.org/pdf/2403.00025v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04543v1","updated":"2025-03-06T15:29:13Z","published":"2025-03-06T15:29:13Z","title":"Keeping Yourself is Important in Downstream Tuning Multimodal Large\n Language Model","summary":" Multi-modal Large Language Models (MLLMs) integrate visual and linguistic\nreasoning to address complex tasks such as image captioning and visual question\nanswering. While MLLMs demonstrate remarkable versatility, MLLMs appears\nlimited performance on special applications. But tuning MLLMs for downstream\ntasks encounters two key challenges: Task-Expert Specialization, where\ndistribution shifts between pre-training and target datasets constrain target\nperformance, and Open-World Stabilization, where catastrophic forgetting erases\nthe model general knowledge. In this work, we systematically review recent\nadvancements in MLLM tuning methodologies, classifying them into three\nparadigms: (I) Selective Tuning, (II) Additive Tuning, and (III)\nReparameterization Tuning. Furthermore, we benchmark these tuning strategies\nacross popular MLLM architectures and diverse downstream tasks to establish\nstandardized evaluation analysis and systematic tuning principles. Finally, we\nhighlight several open challenges in this domain and propose future research\ndirections. 
To facilitate ongoing progress in this rapidly evolving field, we\nprovide a public repository that continuously tracks developments:\nhttps://github.com/WenkeHuang/Awesome-MLLM-Tuning.\n","authors":["Wenke Huang","Jian Liang","Xianda Guo","Yiyang Fang","Guancheng Wan","Xuankun Rong","Chi Wen","Zekun Shi","Qingyun Li","Didi Zhu","Yanbiao Ma","Ke Liang","Bin Yang","He Li","Jiawei Shao","Mang Ye","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2503.04543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07180v5","updated":"2025-03-06T15:26:56Z","published":"2024-11-11T17:57:30Z","title":"Gumbel Counterfactual Generation From Language Models","summary":" Understanding and manipulating the causal generation mechanisms in language\nmodels is essential for controlling their behavior. Previous work has primarily\nrelied on techniques such as representation surgery -- e.g., model ablations or\nmanipulation of linear subspaces tied to specific concepts -- to\n\\emph{intervene} on these models. To understand the impact of interventions\nprecisely, it is useful to examine \\emph{counterfactuals} -- e.g., how a given\nsentence would have appeared had it been generated by the model following a\nspecific intervention. We highlight that counterfactual reasoning is\nconceptually distinct from interventions, as articulated in Pearl's causal\nhierarchy. Based on this observation, we propose a framework for generating\ntrue string counterfactuals by reformulating language models as a structural\nequation model using the Gumbel-max trick, which we called Gumbel\ncounterfactual generation. This reformulation allows us to model the joint\ndistribution over original strings and their counterfactuals resulting from the\nsame instantiation of the sampling noise. We develop an algorithm based on\nhindsight Gumbel sampling that allows us to infer the latent noise variables\nand generate counterfactuals of observed strings. 
Our experiments demonstrate\nthat the approach produces meaningful counterfactuals while at the same time\nshowing that commonly used intervention techniques have considerable undesired\nside effects.\n","authors":["Shauli Ravfogel","Anej Svete","Vésteinn Snæbjarnarson","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07180v5.pdf","comment":"Accepted in ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04530v1","updated":"2025-03-06T15:19:17Z","published":"2025-03-06T15:19:17Z","title":"SOLAR: Scalable Optimization of Large-scale Architecture for Reasoning","summary":" Large Language Models (LLMs) excel in reasoning but remain constrained by\ntheir Chain-of-Thought (CoT) approach, which struggles with complex tasks\nrequiring more nuanced topological reasoning. We introduce SOLAR, Scalable\nOptimization of Large-scale Architecture for Reasoning, a framework that\ndynamically optimizes various reasoning topologies to enhance accuracy and\nefficiency.\n Our Topological Annotation Generation (TAG) system automates topological\ndataset creation and segmentation, improving post-training and evaluation.\nAdditionally, we propose Topological-Scaling, a reward-driven framework that\naligns training and inference scaling, equipping LLMs with adaptive, task-aware\nreasoning.\n SOLAR achieves substantial gains on MATH and GSM8K: +5% accuracy with\nTopological Tuning, +9% with Topological Reward, and +10.02% with Hybrid\nScaling. It also reduces response length by over 5% for complex problems,\nlowering inference latency.\n To foster the reward system, we train a multi-task Topological Reward Model\n(M-TRM), which autonomously selects the best reasoning topology and answer in a\nsingle pass, eliminating the need for training and inference on multiple\nsingle-task TRMs (S-TRMs), thus reducing both training cost and inference\nlatency. 
In addition, in terms of performance, M-TRM surpasses all S-TRMs,\nimproving accuracy by +10% and rank correlation by +9%.\n To the best of our knowledge, SOLAR sets a new benchmark for scalable,\nhigh-precision LLM reasoning while introducing an automated annotation process\nand a dynamic reasoning topology competition mechanism.\n","authors":["Chen Li","Yinyi Luo","Anudeep Bolimera","Marios Savvides"],"pdf_url":"https://arxiv.org/pdf/2503.04530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04521v1","updated":"2025-03-06T15:08:31Z","published":"2025-03-06T15:08:31Z","title":"Dynamic Pricing for On-Demand DNN Inference in the Edge-AI Market","summary":" The convergence of edge computing and AI gives rise to Edge-AI, which enables\nthe deployment of real-time AI applications and services at the network edge.\nOne of the fundamental research issues in Edge-AI is edge inference\nacceleration, which aims to realize low-latency high-accuracy DNN inference\nservices by leveraging the fine-grained offloading of partitioned inference\ntasks from end devices to edge servers. However, existing research has yet to\nadopt a practical Edge-AI market perspective, which would systematically\nexplore the personalized inference needs of AI users (e.g., inference accuracy,\nlatency, and task complexity), the revenue incentives for AI service providers\nthat offer edge inference services, and multi-stakeholder governance within a\nmarket-oriented context. To bridge this gap, we propose an Auction-based Edge\nInference Pricing Mechanism (AERIA) for revenue maximization to tackle the\nmulti-dimensional optimization problem of DNN model partition, edge inference\npricing, and resource allocation. We investigate the multi-exit device-edge\nsynergistic inference scheme for on-demand DNN inference acceleration, and\nanalyse the auction dynamics amongst the AI service providers, AI users and\nedge infrastructure provider. 
Owing to the strategic mechanism design via\nrandomized consensus estimate and cost sharing techniques, the Edge-AI market\nattains several desirable properties, including competitiveness in revenue\nmaximization, incentive compatibility, and envy-freeness, which are crucial to\nmaintain the effectiveness, truthfulness, and fairness of our auction outcomes.\nThe extensive simulation experiments based on four representative DNN inference\nworkloads demonstrate that our AERIA mechanism significantly outperforms\nseveral state-of-the-art approaches in revenue maximization, demonstrating the\nefficacy of AERIA for on-demand DNN inference in the Edge-AI market.\n","authors":["Songyuan Li","Jia Hu","Geyong Min","Haojun Huang","Jiwei Huang"],"pdf_url":"https://arxiv.org/pdf/2503.04521v1.pdf","comment":"Index Terms: Edge-AI, DNN Inference Offloading, Resource Management,\n Dynamic Pricing, Auction Mechanism"},{"id":"http://arxiv.org/abs/2502.05874v2","updated":"2025-03-06T15:02:33Z","published":"2025-02-09T12:23:40Z","title":"MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor\n Scene Generation","summary":" Controllable 3D scene generation has extensive applications in virtual\nreality and interior design, where the generated scenes should exhibit high\nlevels of realism and controllability in terms of geometry. Scene graphs\nprovide a suitable data representation that facilitates these applications.\nHowever, current graph-based methods for scene generation are constrained to\ntext-based inputs and exhibit insufficient adaptability to flexible user\ninputs, hindering the ability to precisely control object geometry. To address\nthis issue, we propose MMGDreamer, a dual-branch diffusion model for scene\ngeneration that incorporates a novel Mixed-Modality Graph, visual enhancement\nmodule, and relation predictor. The mixed-modality graph allows object nodes to\nintegrate textual and visual modalities, with optional relationships between\nnodes. 
It enhances adaptability to flexible user inputs and enables meticulous\ncontrol over the geometry of objects in the generated scenes. The visual\nenhancement module enriches the visual fidelity of text-only nodes by\nconstructing visual representations using text embeddings. Furthermore, our\nrelation predictor leverages node representations to infer absent relationships\nbetween nodes, resulting in more coherent scene layouts. Extensive experimental\nresults demonstrate that MMGDreamer exhibits superior control of object\ngeometry, achieving state-of-the-art scene generation performance. Project\npage: https://yangzhifeio.github.io/project/MMGDreamer.\n","authors":["Zhifei Yang","Keyang Lu","Chao Zhang","Jiaxing Qi","Hanqi Jiang","Ruifei Ma","Shenglin Yin","Yifan Xu","Mingzhe Xing","Zhen Xiao","Jieyi Long","Xiangde Liu","Guangyao Zhai"],"pdf_url":"https://arxiv.org/pdf/2502.05874v2.pdf","comment":"Accepted by AAAI 2025 Main Track"},{"id":"http://arxiv.org/abs/2503.04509v1","updated":"2025-03-06T14:55:25Z","published":"2025-03-06T14:55:25Z","title":"STX-Search: Explanation Search for Continuous Dynamic Spatio-Temporal\n Models","summary":" Recent improvements in the expressive power of spatio-temporal models have\nled to performance gains in many real-world applications, such as traffic\nforecasting and social network modelling. However, understanding the\npredictions from a model is crucial to ensure reliability and trustworthiness,\nparticularly for high-risk applications, such as healthcare and transport. Few\nexisting methods are able to generate explanations for models trained on\ncontinuous-time dynamic graph data and, of these, the computational complexity\nand lack of suitable explanation objectives pose challenges. 
In this paper, we\npropose $\\textbf{S}$patio-$\\textbf{T}$emporal E$\\textbf{X}$planation\n$\\textbf{Search}$ (STX-Search), a novel method for generating instance-level\nexplanations that is applicable to static and dynamic temporal graph\nstructures. We introduce a novel search strategy and objective function, to\nfind explanations that are highly faithful and interpretable. When compared\nwith existing methods, STX-Search produces explanations of higher fidelity\nwhilst optimising explanation size to maintain interpretability.\n","authors":["Saif Anwar","Nathan Griffiths","Thomas Popham","Abhir Bhalerao"],"pdf_url":"https://arxiv.org/pdf/2503.04509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04506v1","updated":"2025-03-06T14:53:37Z","published":"2025-03-06T14:53:37Z","title":"Multi-modal Summarization in Model-Based Engineering: Automotive\n Software Development Case Study","summary":" Multimodal summarization integrating information from diverse data modalities\npresents a promising solution to aid the understanding of information within\nvarious processes. However, the application and advantages of multimodal\nsummarization have not received much attention in model-based engineering\n(MBE), where it has become a cornerstone in the design and development of\ncomplex systems, leveraging formal models to improve understanding, validation\nand automation throughout the engineering lifecycle. UML and EMF diagrams in\nmodel-based engineering contain a large amount of multimodal information and\nintricate relational data. Hence, our study explores the application of\nmultimodal large language models within the domain of model-based engineering\nto evaluate their capacity for understanding and identifying relationships,\nfeatures, and functionalities embedded in UML and EMF diagrams. We aim to\ndemonstrate the transformative potential benefits and limitations of multimodal\nsummarization in improving productivity and accuracy in MBE practices. 
The\nproposed approach is evaluated within the context of automotive software\ndevelopment, while many promising state-of-art models were taken into account.\n","authors":["Nenad Petrovic","Yurui Zhang","Moaad Maaroufi","Kuo-Yi Chao","Lukasz Mazur","Fengjunjie Pan","Vahid Zolfaghari","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2503.04506v1.pdf","comment":"Conference paper accepted for IntelliSys2025"},{"id":"http://arxiv.org/abs/2503.04502v1","updated":"2025-03-06T14:50:29Z","published":"2025-03-06T14:50:29Z","title":"Interpretable Transformation and Analysis of Timelines through Learning\n via Surprisability","summary":" The analysis of high-dimensional timeline data and the identification of\noutliers and anomalies is critical across diverse domains, including sensor\nreadings, biological and medical data, historical records, and global\nstatistics. However, conventional analysis techniques often struggle with\nchallenges such as high dimensionality, complex distributions, and sparsity.\nThese limitations hinder the ability to extract meaningful insights from\ncomplex temporal datasets, making it difficult to identify trending features,\noutliers, and anomalies effectively. Inspired by surprisability -- a cognitive\nscience concept describing how humans instinctively focus on unexpected\ndeviations - we propose Learning via Surprisability (LvS), a novel approach for\ntransforming high-dimensional timeline data. LvS quantifies and prioritizes\nanomalies in time-series data by formalizing deviations from expected behavior.\nLvS bridges cognitive theories of attention with computational methods,\nenabling the detection of anomalies and shifts in a way that preserves critical\ncontext, offering a new lens for interpreting complex datasets. 
We demonstrate\nthe usefulness of LvS on three high-dimensional timeline use cases: a time\nseries of sensor data, a global dataset of mortality causes over multiple\nyears, and a textual corpus containing over two centuries of State of the Union\nAddresses by U.S. presidents. Our results show that the LvS transformation\nenables efficient and interpretable identification of outliers, anomalies, and\nthe most variable features along the timeline.\n","authors":["Osnat Mokryn","Teddy Lazebnik","Hagit Ben Shoshan"],"pdf_url":"https://arxiv.org/pdf/2503.04502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04500v1","updated":"2025-03-06T14:49:28Z","published":"2025-03-06T14:49:28Z","title":"ReynoldsFlow: Exquisite Flow Estimation via Reynolds Transport Theorem","summary":" Optical flow is a fundamental technique for motion estimation, widely applied\nin video stabilization, interpolation, and object tracking. Recent advancements\nin artificial intelligence (AI) have enabled deep learning models to leverage\noptical flow as an important feature for motion analysis. However, traditional\noptical flow methods rely on restrictive assumptions, such as brightness\nconstancy and slow motion constraints, limiting their effectiveness in complex\nscenes. Deep learning-based approaches require extensive training on large\ndomain-specific datasets, making them computationally demanding. Furthermore,\noptical flow is typically visualized in the HSV color space, which introduces\nnonlinear distortions when converted to RGB and is highly sensitive to noise,\ndegrading motion representation accuracy. These limitations inherently\nconstrain the performance of downstream models, potentially hindering object\ntracking and motion analysis tasks. To address these challenges, we propose\nReynolds flow, a novel training-free flow estimation inspired by the Reynolds\ntransport theorem, offering a principled approach to modeling complex motion\ndynamics. 
Beyond the conventional HSV-based visualization, denoted\nReynoldsFlow, we introduce an alternative representation, ReynoldsFlow+,\ndesigned to improve flow visualization. We evaluate ReynoldsFlow and\nReynoldsFlow+ across three video-based benchmarks: tiny object detection on\nUAVDB, infrared object detection on Anti-UAV, and pose estimation on GolfDB.\nExperimental results demonstrate that networks trained with ReynoldsFlow+\nachieve state-of-the-art (SOTA) performance, exhibiting improved robustness and\nefficiency across all tasks.\n","authors":["Yu-Hsi Chen","Chin-Tien Wu"],"pdf_url":"https://arxiv.org/pdf/2503.04500v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2503.02012v2","updated":"2025-03-06T14:32:23Z","published":"2025-03-03T19:41:22Z","title":"Pretrained Embeddings as a Behavior Specification Mechanism","summary":" We propose an approach to formally specifying the behavioral properties of\nsystems that rely on a perception model for interactions with the physical\nworld. The key idea is to introduce embeddings -- mathematical representations\nof a real-world concept -- as a first-class construct in a specification\nlanguage, where properties are expressed in terms of distances between a pair\nof ideal and observed embeddings. To realize this approach, we propose a new\ntype of temporal logic called Embedding Temporal Logic (ETL), and describe how\nit can be used to express a wider range of properties about AI-enabled systems\nthan previously possible. 
We demonstrate the applicability of ETL through a\npreliminary evaluation involving planning tasks in robots that are driven by\nfoundation models; the results are promising, showing that embedding-based\nspecifications can be used to steer a system towards desirable behaviors.\n","authors":["Parv Kapoor","Abigail Hammer","Ashish Kapoor","Karen Leung","Eunsuk Kang"],"pdf_url":"https://arxiv.org/pdf/2503.02012v2.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2503.04482v1","updated":"2025-03-06T14:30:55Z","published":"2025-03-06T14:30:55Z","title":"Generalized Interpolating Discrete Diffusion","summary":" While state-of-the-art language models achieve impressive results through\nnext-token prediction, they have inherent limitations such as the inability to\nrevise already generated tokens. This has prompted exploration of alternative\napproaches such as discrete diffusion. However, masked diffusion, which has\nemerged as a popular choice due to its simplicity and effectiveness,\nreintroduces this inability to revise words. To overcome this, we generalize\nmasked diffusion and derive the theoretical backbone of a family of general\ninterpolating discrete diffusion (GIDD) processes offering greater flexibility\nin the design of the noising processes. Leveraging a novel diffusion ELBO, we\nachieve compute-matched state-of-the-art performance in diffusion language\nmodeling. Exploiting GIDD's flexibility, we explore a hybrid approach combining\nmasking and uniform noise, leading to improved sample quality and unlocking the\nability for the model to correct its own mistakes, an area where autoregressive\nmodels notoriously have struggled. 
Our code and models are open-source:\nhttps://github.com/dvruette/gidd/\n","authors":["Dimitri von Rütte","Janis Fluri","Yuhui Ding","Antonio Orvieto","Bernhard Schölkopf","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2503.04482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04479v1","updated":"2025-03-06T14:29:52Z","published":"2025-03-06T14:29:52Z","title":"ToolFuzz -- Automated Agent Tool Testing","summary":" Large Language Model (LLM) Agents leverage the advanced reasoning\ncapabilities of LLMs in real-world applications. To interface with an\nenvironment, these agents often rely on tools, such as web search or database\nAPIs. As the agent provides the LLM with tool documentation along the user\nquery, the completeness and correctness of this documentation is critical.\nHowever, tool documentation is often over-, under-, or ill-specified, impeding\nthe agent's accuracy. Standard software testing approaches struggle to identify\nthese errors as they are expressed in natural language. Thus, despite its\nimportance, there currently exists no automated method to test the tool\ndocumentation for agents. To address this issue, we present ToolFuzz, the first\nmethod for automated testing of tool documentations. ToolFuzz is designed to\ndiscover two types of errors: (1) user queries leading to tool runtime errors\nand (2) user queries that lead to incorrect agent responses. ToolFuzz can\ngenerate a large and diverse set of natural inputs, effectively finding tool\ndescription errors at a low false positive rate. Further, we present two\nstraightforward prompt-engineering approaches. We evaluate all three tool\ntesting approaches on 32 common LangChain tools and 35 newly created custom\ntools and 2 novel benchmarks to further strengthen the assessment. We find that\nmany publicly available tools suffer from underspecification. 
Specifically, we\nshow that ToolFuzz identifies 20x more erroneous inputs compared to the\nprompt-engineering approaches, making it a key component for building reliable\nAI agents.\n","authors":["Ivan Milev","Mislav Balunović","Maximilian Baader","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2503.04479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03606v2","updated":"2025-03-06T14:28:36Z","published":"2025-03-05T15:42:37Z","title":"Decoupled Recommender Systems: Exploring Alternative Recommender\n Ecosystem Designs","summary":" Recommender ecosystems are an emerging subject of research. Such research\nexamines how the characteristics of algorithms, recommendation consumers, and\nitem providers influence system dynamics and long-term outcomes. One\narchitectural possibility that has not yet been widely explored in this line of\nresearch is the consequences of a configuration in which recommendation\nalgorithms are decoupled from the platforms they serve. This is sometimes\ncalled \"the friendly neighborhood algorithm store\" or \"middleware\" model. We\nare particularly interested in how such architectures might offer a range of\ndifferent distributions of utility across consumers, providers, and\nrecommendation platforms. In this paper, we create a model of a recommendation\necosystem that incorporates algorithm choice and examine the outcomes of such a\ndesign.\n","authors":["Anas Buhayh","Elizabeth McKinnie","Robin Burke"],"pdf_url":"https://arxiv.org/pdf/2503.03606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13524v4","updated":"2025-03-06T14:27:12Z","published":"2025-02-19T08:21:59Z","title":"MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D\n Medical Image Analysis","summary":" Efficient evaluation of three-dimensional (3D) medical images is crucial for\ndiagnostic and therapeutic practices in healthcare. 
Recent years have seen a\nsubstantial uptake in applying deep learning and computer vision to analyse and\ninterpret medical images. Traditional approaches, such as convolutional neural\nnetworks (CNNs) and vision transformers (ViTs), face significant computational\nchallenges, prompting the need for architectural advancements. Recent efforts\nhave led to the introduction of novel architectures like the ``Mamba'' model as\nalternative solutions to traditional CNNs or ViTs. The Mamba model excels in\nthe linear processing of one-dimensional data with low computational demands.\nHowever, Mamba's potential for 3D medical image analysis remains underexplored\nand could face significant computational challenges as the dimension increases.\nThis manuscript presents MobileViM, a streamlined architecture for efficient\nsegmentation of 3D medical images. In the MobileViM network, we invent a new\ndimension-independent mechanism and a dual-direction traversing approach to\nincorporate with a vision-Mamba-based framework. MobileViM also features a\ncross-scale bridging technique to improve efficiency and accuracy across\nvarious medical imaging modalities. With these enhancements, MobileViM achieves\nsegmentation speeds exceeding 90 frames per second (FPS) on a single graphics\nprocessing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster\nthan the state-of-the-art deep learning models for processing 3D images with\nthe same computational resources. 
In addition, experimental evaluations\ndemonstrate that MobileViM delivers superior performance, with Dice similarity\nscores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024,\nATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses\nexisting models.\n","authors":["Wei Dai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2502.13524v4.pdf","comment":"The corresponding author disagrees with the manuscript submitted to\n arXiv"},{"id":"http://arxiv.org/abs/2503.04472v1","updated":"2025-03-06T14:23:06Z","published":"2025-03-06T14:23:06Z","title":"DAST: Difficulty-Adaptive Slow-Thinking for Large Reasoning Models","summary":" Recent advancements in slow-thinking reasoning models have shown exceptional\nperformance in complex reasoning tasks. However, these models often exhibit\noverthinking-generating redundant reasoning steps for simple problems, leading\nto excessive computational resource usage. While current mitigation strategies\nuniformly reduce reasoning tokens, they risk degrading performance on\nchallenging tasks that require extended reasoning. This paper introduces\nDifficulty-Adaptive Slow-Thinking (DAST), a novel framework that enables models\nto autonomously adjust the length of Chain-of-Thought(CoT) based on problem\ndifficulty. We first propose a Token Length Budget (TLB) metric to quantify\ndifficulty, then leveraging length-aware reward shaping and length preference\noptimization to implement DAST. 
DAST penalizes overlong responses for simple\ntasks while incentivizing sufficient reasoning for complex problems.\nExperiments on diverse datasets and model scales demonstrate that DAST\neffectively mitigates overthinking (reducing token usage by over 30\\% on\naverage) while preserving reasoning accuracy on complex problems.\n","authors":["Yi Shen","Jian Zhang","Jieyun Huang","Shuming Shi","Wenjing Zhang","Jiangze Yan","Ning Wang","Kai Wang","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2503.04472v1.pdf","comment":"working in progress"},{"id":"http://arxiv.org/abs/2503.04457v1","updated":"2025-03-06T14:11:00Z","published":"2025-03-06T14:11:00Z","title":"TPC: Cross-Temporal Prediction Connection for Vision-Language Model\n Hallucination Reduction","summary":" Vision-language models (VLMs) have achieved remarkable advancements,\ncapitalizing on the impressive capabilities of large language models (LLMs)\nacross diverse tasks. Despite this, a critical challenge known as hallucination\noccurs when models overconfidently describe objects or attributes absent from\nthe image, a problem exacerbated by the tendency of VLMs to rely on linguistic\npriors. This limitation reduces model reliability in high-stakes applications.\nIn this work, we have observed the characteristic of logits' continuity\nconsistency enhancement and introduced a straightforward and efficient method,\nCross-Temporal Prediction Connection (TPC), designed to enhance the semantic\nconsistency of logits by connecting them temporally across timesteps. TPC\namplifies information flow and improves coherence, effectively reducing\nhallucination. 
Extensive experiments show that TPC surpasses existing\nrepresentatives, delivering superior performance in both accuracy and\nefficiency while maintaining robustness in open-ended text generation tasks.\n","authors":["Chao Wang","Weiwei Fu","Yang Zhou"],"pdf_url":"https://arxiv.org/pdf/2503.04457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13728v2","updated":"2025-03-06T14:07:57Z","published":"2025-02-19T13:54:44Z","title":"Secure Federated Data Distillation","summary":" Dataset Distillation (DD) is a powerful technique for reducing large datasets\ninto compact, representative synthetic datasets, accelerating Machine Learning\ntraining. However, traditional DD methods operate in a centralized manner,\nwhich poses significant privacy threats and reduces its applicability. To\nmitigate these risks, we propose a Secure Federated Data Distillation (SFDD)\nframework to decentralize the distillation process while preserving privacy.\nUnlike existing Federated Distillation techniques that focus on training global\nmodels with distilled knowledge, our approach aims to produce a distilled\ndataset without exposing local contributions. We leverage the\ngradient-matching-based distillation method, adapting it for a distributed\nsetting where clients contribute to the distillation process without sharing\nraw data. The central aggregator iteratively refines a synthetic dataset by\nintegrating client-side updates while ensuring data confidentiality. To make\nour approach resilient to inference attacks perpetrated by the server that\ncould exploit gradient updates to reconstruct private data, we create an\noptimized Local Differential Privacy approach, called LDPO-RLD. Furthermore, we\nassess the framework's resilience against malicious clients executing backdoor\nattacks (such as Doorping) and demonstrate robustness under the assumption of a\nsufficient number of participating clients. 
Our experimental results\ndemonstrate the effectiveness of SFDD and that the proposed defense concretely\nmitigates the identified vulnerabilities, with minimal impact on the\nperformance of the distilled dataset. By addressing the interplay between\nprivacy and federation in dataset distillation, this work advances the field of\nprivacy-preserving Machine Learning making our SFDD framework a viable solution\nfor sensitive data-sharing applications.\n","authors":["Marco Arazzi","Mert Cihangiroglu","Serena Nicolazzo","Antonino Nocera"],"pdf_url":"https://arxiv.org/pdf/2502.13728v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04451v1","updated":"2025-03-06T14:06:20Z","published":"2025-03-06T14:06:20Z","title":"Privacy Preserving and Robust Aggregation for Cross-Silo Federated\n Learning in Non-IID Settings","summary":" Federated Averaging remains the most widely used aggregation strategy in\nfederated learning due to its simplicity and scalability. However, its\nperformance degrades significantly in non-IID data settings, where client\ndistributions are highly imbalanced or skewed. Additionally, it relies on\nclients transmitting metadata, specifically the number of training samples,\nwhich introduces privacy risks and may conflict with regulatory frameworks like\nthe European GDPR. In this paper, we propose a novel aggregation strategy that\naddresses these challenges by introducing class-aware gradient masking. Unlike\ntraditional approaches, our method relies solely on gradient updates,\neliminating the need for any additional client metadata, thereby enhancing\nprivacy protection. 
Furthermore, our approach validates and dynamically weights\nclient contributions based on class-specific importance, ensuring robustness\nagainst non-IID distributions, convergence prevention, and backdoor attacks.\nExtensive experiments on benchmark datasets demonstrate that our method not\nonly outperforms FedAvg and other widely accepted aggregation strategies in\nnon-IID settings but also preserves model integrity in adversarial scenarios.\nOur results establish the effectiveness of gradient masking as a practical and\nsecure solution for federated learning.\n","authors":["Marco Arazzi","Mert Cihangiroglu","Antonino Nocera"],"pdf_url":"https://arxiv.org/pdf/2503.04451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08010v2","updated":"2025-03-06T14:01:48Z","published":"2024-02-12T19:18:50Z","title":"Which Frequencies do CNNs Need? Emergent Bottleneck Structure in Feature\n Learning","summary":" We describe the emergence of a Convolution Bottleneck (CBN) structure in\nCNNs, where the network uses its first few layers to transform the input\nrepresentation into a representation that is supported only along a few\nfrequencies and channels, before using the last few layers to map back to the\noutputs. We define the CBN rank, which describes the number and type of\nfrequencies that are kept inside the bottleneck, and partially prove that the\nparameter norm required to represent a function $f$ scales as depth times the\nCBN rank $f$. We also show that the parameter norm depends at next order on the\nregularity of $f$. We show that any network with almost optimal parameter norm\nwill exhibit a CBN structure in both the weights and - under the assumption\nthat the network is stable under large learning rate - the activations, which\nmotivates the common practice of down-sampling; and we verify that the CBN\nresults still hold with down-sampling. 
Finally we use the CBN structure to\ninterpret the functions learned by CNNs on a number of tasks.\n","authors":["Yuxiao Wen","Arthur Jacot"],"pdf_url":"https://arxiv.org/pdf/2402.08010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13959v2","updated":"2025-03-06T13:51:24Z","published":"2025-01-21T06:32:25Z","title":"Assisting Mathematical Formalization with A Learning-based Premise\n Retriever","summary":" Premise selection is a crucial yet challenging step in mathematical\nformalization, especially for users with limited experience. Due to the lack of\navailable formalization projects, existing approaches that leverage language\nmodels often suffer from data scarcity. In this work, we introduce an\ninnovative method for training a premise retriever to support the formalization\nof mathematics. Our approach employs a BERT model to embed proof states and\npremises into a shared latent space. The retrieval model is trained within a\ncontrastive learning framework and incorporates a domain-specific tokenizer\nalong with a fine-grained similarity computation method. Experimental results\nshow that our model is highly competitive compared to existing baselines,\nachieving strong performance while requiring fewer computational resources.\nPerformance is further enhanced through the integration of a re-ranking module.\nTo streamline the formalization process, we will release a search engine that\nenables users to query Mathlib theorems directly using proof states,\nsignificantly improving accessibility and efficiency. 
Codes are available at\nhttps://github.com/ruc-ai4math/Premise-Retrieval.\n","authors":["Yicheng Tao","Haotian Liu","Shanwen Wang","Hongteng Xu"],"pdf_url":"https://arxiv.org/pdf/2501.13959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17573v2","updated":"2025-03-06T13:47:53Z","published":"2024-05-27T18:15:05Z","title":"Hamiltonian Mechanics of Feature Learning: Bottleneck Structure in Leaky\n ResNets","summary":" We study Leaky ResNets, which interpolate between ResNets and Fully-Connected\nnets depending on an 'effective depth' hyper-parameter $\\tilde{L}$. In the\ninfinite depth limit, we study 'representation geodesics' $A_{p}$: continuous\npaths in representation space (similar to NeuralODEs) from input $p=0$ to\noutput $p=1$ that minimize the parameter norm of the network. We give a\nLagrangian and Hamiltonian reformulation, which highlight the importance of two\nterms: a kinetic energy which favors small layer derivatives\n$\\partial_{p}A_{p}$ and a potential energy that favors low-dimensional\nrepresentations, as measured by the 'Cost of Identity'. The balance between\nthese two forces offers an intuitive understanding of feature learning in\nResNets. We leverage this intuition to explain the emergence of a bottleneck\nstructure, as observed in previous work: for large $\\tilde{L}$ the potential\nenergy dominates and leads to a separation of timescales, where the\nrepresentation jumps rapidly from the high dimensional inputs to a\nlow-dimensional representation, move slowly inside the space of low-dimensional\nrepresentations, before jumping back to the potentially high-dimensional\noutputs. 
Inspired by this phenomenon, we train with an adaptive layer step-size\nto adapt to the separation of timescales.\n","authors":["Arthur Jacot","Alexandre Kaiser"],"pdf_url":"https://arxiv.org/pdf/2405.17573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05664v2","updated":"2025-03-06T13:40:09Z","published":"2024-07-08T06:59:29Z","title":"How DNNs break the Curse of Dimensionality: Compositionality and\n Symmetry Learning","summary":" We show that deep neural networks (DNNs) can efficiently learn any\ncomposition of functions with bounded $F_{1}$-norm, which allows DNNs to break\nthe curse of dimensionality in ways that shallow networks cannot. More\nspecifically, we derive a generalization bound that combines a covering number\nargument for compositionality, and the $F_{1}$-norm (or the related Barron\nnorm) for large width adaptivity. We show that the global minimizer of the\nregularized loss of DNNs can fit for example the composition of two functions\n$f^{*}=h\\circ g$ from a small number of observations, assuming $g$ is\nsmooth/regular and reduces the dimensionality (e.g. $g$ could be the quotient\nmap of the symmetries of $f^{*}$), so that $h$ can be learned in spite of its\nlow regularity. The measures of regularity we consider is the Sobolev norm with\ndifferent levels of differentiability, which is well adapted to the $F_{1}$\nnorm. We compute scaling laws empirically and observe phase transitions\ndepending on whether $g$ or $h$ is harder to learn, as predicted by our theory.\n","authors":["Arthur Jacot","Seok Hoan Choi","Yuxiao Wen"],"pdf_url":"https://arxiv.org/pdf/2407.05664v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12261v3","updated":"2025-03-06T13:39:32Z","published":"2024-10-16T05:58:55Z","title":"CATCH: Channel-Aware multivariate Time Series Anomaly Detection via\n Frequency Patching","summary":" Anomaly detection in multivariate time series is challenging as heterogeneous\nsubsequence anomalies may occur. 
Reconstruction-based methods, which focus on\nlearning normal patterns in the frequency domain to detect diverse abnormal\nsubsequences, achieve promising results, while still falling short on capturing\nfine-grained frequency characteristics and channel correlations. To contend\nwith the limitations, we introduce CATCH, a framework based on frequency\npatching. We propose to patchify the frequency domain into frequency bands,\nwhich enhances its ability to capture fine-grained frequency characteristics.\nTo perceive appropriate channel correlations, we propose a Channel Fusion\nModule (CFM), which features a patch-wise mask generator and a masked-attention\nmechanism. Driven by a bi-level multi-objective optimization algorithm, the CFM\nis encouraged to iteratively discover appropriate patch-wise channel\ncorrelations, and to cluster relevant channels while isolating adverse effects\nfrom irrelevant channels. Extensive experiments on 10 real-world datasets and\n12 synthetic datasets demonstrate that CATCH achieves state-of-the-art\nperformance. We make our code and datasets available at\nhttps://github.com/decisionintelligence/CATCH.\n","authors":["Xingjian Wu","Xiangfei Qiu","Zhengyu Li","Yihang Wang","Jilin Hu","Chenjuan Guo","Hui Xiong","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2410.12261v3.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04429v1","updated":"2025-03-06T13:38:44Z","published":"2025-03-06T13:38:44Z","title":"Activation Space Interventions Can Be Transferred Between Large Language\n Models","summary":" The study of representation universality in AI models reveals growing\nconvergence across domains, modalities, and architectures. However, the\npractical applications of representation universality remain largely\nunexplored. We bridge this gap by demonstrating that safety interventions can\nbe transferred between models through learned mappings of their shared\nactivation spaces. 
We demonstrate this approach on two well-established AI\nsafety tasks: backdoor removal and refusal of harmful prompts, showing\nsuccessful transfer of steering vectors that alter the models' outputs in a\npredictable way. Additionally, we propose a new task, \\textit{corrupted\ncapabilities}, where models are fine-tuned to embed knowledge tied to a\nbackdoor. This tests their ability to separate useful skills from backdoors,\nreflecting real-world challenges. Extensive experiments across Llama, Qwen and\nGemma model families show that our method enables using smaller models to\nefficiently align larger ones. Furthermore, we demonstrate that autoencoder\nmappings between base and fine-tuned models can serve as reliable ``lightweight\nsafety switches\", allowing dynamic toggling between model behaviors.\n","authors":["Narmeen Oozeer","Dhruv Nathawani","Nirmalendu Prakash","Michael Lan","Abir Harrasse","Amirali Abdullah"],"pdf_url":"https://arxiv.org/pdf/2503.04429v1.pdf","comment":"68 pages"},{"id":"http://arxiv.org/abs/2503.04422v1","updated":"2025-03-06T13:31:16Z","published":"2025-03-06T13:31:16Z","title":"PDX: A Data Layout for Vector Similarity Search","summary":" We propose Partition Dimensions Across (PDX), a data layout for vectors\n(e.g., embeddings) that, similar to PAX [6], stores multiple vectors in one\nblock, using a vertical layout for the dimensions (Figure 1). PDX accelerates\nexact and approximate similarity search thanks to its dimension-by-dimension\nsearch strategy that operates on multiple-vectors-at-a-time in tight loops. It\nbeats SIMD-optimized distance kernels on standard horizontal vector storage\n(avg 40% faster), only relying on scalar code that gets auto-vectorized. We\ncombined the PDX layout with recent dimension-pruning algorithms ADSampling\n[19] and BSA [52] that accelerate approximate vector search. 
We found that\nthese algorithms on the horizontal vector layout can lose to SIMD-optimized\nlinear scans, even if they are SIMD-optimized. However, when used on PDX, their\nbenefit is restored to 2-7x. We find that search on PDX is especially fast if a\nlimited number of dimensions has to be scanned fully, which is what the\ndimension-pruning approaches do. We finally introduce PDX-BOND, an even more\nflexible dimension-pruning strategy, with good performance on exact search and\nreasonable performance on approximate search. Unlike previous pruning\nalgorithms, it can work on vector data \"as-is\" without preprocessing; making it\nattractive for vector databases with frequent updates.\n","authors":["Leonardo Kuffo","Elena Krippner","Peter Boncz"],"pdf_url":"https://arxiv.org/pdf/2503.04422v1.pdf","comment":"To be published in Proceedings of The 2025 International Conference\n on Management of Data (SIGMOD '25). For associated code, see\n https://github.com/cwida/PDX"},{"id":"http://arxiv.org/abs/2311.07978v4","updated":"2025-03-06T13:29:24Z","published":"2023-11-14T08:10:14Z","title":"AfroBench: How Good are Large Language Models on African Languages?","summary":" Large-scale multilingual evaluations, such as MEGA, often include only a\nhandful of African languages due to the scarcity of high-quality evaluation\ndata and the limited discoverability of existing African datasets. This lack of\nrepresentation hinders comprehensive LLM evaluation across a diverse range of\nlanguages and tasks. To address these challenges, we introduce AfroBench -- a\nmulti-task benchmark for evaluating the performance of LLMs across 64 African\nlanguages, 15 tasks and 22 datasets. AfroBench consists of nine natural\nlanguage understanding datasets, six text generation datasets, six knowledge\nand question answering tasks, and one mathematical reasoning task. We present\nresults comparing the performance of prompting LLMs to fine-tuned baselines\nbased on BERT and T5-style models. 
Our results suggest large gaps in\nperformance between high-resource languages, such as English, and African\nlanguages across most tasks; but performance also varies based on the\navailability of monolingual data resources. Our findings confirm that\nperformance on African languages continues to remain a hurdle for current LLMs,\nunderscoring the need for additional efforts to close this gap.\n https://mcgill-nlp.github.io/AfroBench/\n","authors":["Jessica Ojo","Odunayo Ogundepo","Akintunde Oladipo","Kelechi Ogueji","Jimmy Lin","Pontus Stenetorp","David Ifeoluwa Adelani"],"pdf_url":"https://arxiv.org/pdf/2311.07978v4.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2503.04417v1","updated":"2025-03-06T13:21:27Z","published":"2025-03-06T13:21:27Z","title":"From Idea to CAD: A Language Model-Driven Multi-Agent System for\n Collaborative Design","summary":" Creating digital models using Computer Aided Design (CAD) is a process that\nrequires in-depth expertise. In industrial product development, this process\ntypically involves entire teams of engineers, spanning requirements\nengineering, CAD itself, and quality assurance. We present an approach that\nmirrors this team structure with a Vision Language Model (VLM)-based Multi\nAgent System, with access to parametric CAD tooling and tool documentation.\nCombining agents for requirements engineering, CAD engineering, and\nvision-based quality assurance, a model is generated automatically from\nsketches and/ or textual descriptions. The resulting model can be refined\ncollaboratively in an iterative validation loop with the user. Our approach has\nthe potential to increase the effectiveness of design processes, both for\nindustry experts and for hobbyists who create models for 3D printing. 
We\ndemonstrate the potential of the architecture at the example of various design\ntasks and provide several ablations that show the benefits of the\narchitecture's individual components.\n","authors":["Felix Ocker","Stefan Menzel","Ahmed Sadik","Thiago Rios"],"pdf_url":"https://arxiv.org/pdf/2503.04417v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2503.04416v1","updated":"2025-03-06T13:18:37Z","published":"2025-03-06T13:18:37Z","title":"Learning Transformer-based World Models with Contrastive Predictive\n Coding","summary":" The DreamerV3 algorithm recently obtained remarkable performance across\ndiverse environment domains by learning an accurate world model based on\nRecurrent Neural Networks (RNNs). Following the success of model-based\nreinforcement learning algorithms and the rapid adoption of the Transformer\narchitecture for its superior training efficiency and favorable scaling\nproperties, recent works such as STORM have proposed replacing RNN-based world\nmodels with Transformer-based world models using masked self-attention.\nHowever, despite the improved training efficiency of these methods, their\nimpact on performance remains limited compared to the Dreamer algorithm,\nstruggling to learn competitive Transformer-based world models. In this work,\nwe show that the next state prediction objective adopted in previous approaches\nis insufficient to fully exploit the representation capabilities of\nTransformers. We propose to extend world model predictions to longer time\nhorizons by introducing TWISTER (Transformer-based World model wIth contraSTivE\nRepresentations), a world model using action-conditioned Contrastive Predictive\nCoding to learn high-level temporal feature representations and improve the\nagent performance. 
TWISTER achieves a human-normalized mean score of 162% on\nthe Atari 100k benchmark, setting a new record among state-of-the-art methods\nthat do not employ look-ahead search.\n","authors":["Maxime Burchi","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2503.04416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04412v1","updated":"2025-03-06T13:10:40Z","published":"2025-03-06T13:10:40Z","title":"Wider or Deeper? Scaling LLM Inference-Time Compute with Adaptive\n Branching Tree Search","summary":" Recent advances demonstrate that increasing inference-time computation can\nsignificantly boost the reasoning capabilities of large language models (LLMs).\nAlthough repeated sampling (i.e., generating multiple candidate outputs) is a\nhighly effective strategy, it does not leverage external feedback signals for\nrefinement, which are often available in tasks like coding. In this work, we\npropose $\\textit{Adaptive Branching Monte Carlo Tree Search (AB-MCTS)}$, a\nnovel inference-time framework that generalizes repeated sampling with\nprincipled multi-turn exploration and exploitation. At each node in the search\ntree, AB-MCTS dynamically decides whether to \"go wider\" by expanding new\ncandidate responses or \"go deeper\" by revisiting existing ones based on\nexternal feedback signals. We evaluate our method on complex coding and\nengineering tasks using frontier models. 
Empirical results show that AB-MCTS\nconsistently outperforms both repeated sampling and standard MCTS, underscoring\nthe importance of combining the response diversity of LLMs with multi-turn\nsolution refinement for effective inference-time scaling.\n","authors":["Kou Misaki","Yuichi Inoue","Yuki Imajuku","So Kuroki","Taishi Nakamura","Takuya Akiba"],"pdf_url":"https://arxiv.org/pdf/2503.04412v1.pdf","comment":"To appear at ICLR 2025 Workshop on Foundation Models in the Wild"},{"id":"http://arxiv.org/abs/2503.04406v1","updated":"2025-03-06T13:00:53Z","published":"2025-03-06T13:00:53Z","title":"Training-Free Graph Filtering via Multimodal Feature Refinement for\n Extremely Fast Multimodal Recommendation","summary":" Multimodal recommender systems improve the performance of canonical\nrecommender systems with no item features by utilizing diverse content types\nsuch as text, images, and videos, while alleviating inherent sparsity of\nuser-item interactions and accelerating user engagement. However, current\nneural network-based models often incur significant computational overhead due\nto the complex training process required to learn and integrate information\nfrom multiple modalities. To overcome this limitation, we propose\nMultiModal-Graph Filtering (MM-GF), a training-free method based on the notion\nof graph filtering (GF) for efficient and accurate multimodal recommendations.\nSpecifically, MM-GF first constructs multiple similarity graphs through\nnontrivial multimodal feature refinement such as robust scaling and vector\nshifting by addressing the heterogeneous characteristics across modalities.\nThen, MM-GF optimally fuses multimodal information using linear low-pass\nfilters across different modalities. 
Extensive experiments on real-world\nbenchmark datasets demonstrate that MM-GF not only improves recommendation\naccuracy by up to 13.35% compared to the best competitor but also dramatically\nreduces computational costs by achieving the runtime of less than 10 seconds.\n","authors":["Yu-Seung Roh","Joo-Young Kim","Jin-Duk Park","Won-Yong Shin"],"pdf_url":"https://arxiv.org/pdf/2503.04406v1.pdf","comment":"10 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2406.12753v2","updated":"2025-03-06T12:55:25Z","published":"2024-06-18T16:20:53Z","title":"OlympicArena: Benchmarking Multi-discipline Cognitive Reasoning for\n Superintelligent AI","summary":" The evolution of Artificial Intelligence (AI) has been significantly\naccelerated by advancements in Large Language Models (LLMs) and Large\nMultimodal Models (LMMs), gradually showcasing potential cognitive reasoning\nabilities in problem-solving and scientific discovery (i.e., AI4Science) once\nexclusive to human intellect. To comprehensively evaluate current models'\nperformance in cognitive reasoning abilities, we introduce OlympicArena, which\nincludes 11,163 bilingual problems across both text-only and interleaved\ntext-image modalities. These challenges encompass a wide range of disciplines\nspanning seven fields and 62 international Olympic competitions, rigorously\nexamined for data leakage. We argue that the challenges in Olympic competition\nproblems are ideal for evaluating AI's cognitive reasoning due to their\ncomplexity and interdisciplinary nature, which are essential for tackling\ncomplex scientific challenges and facilitating discoveries. Beyond evaluating\nperformance across various disciplines using answer-only criteria, we conduct\ndetailed experiments and analyses from multiple perspectives. 
We delve into the\nmodels' cognitive reasoning abilities, their performance across different\nmodalities, and their outcomes in process-level evaluations, which are vital\nfor tasks requiring complex reasoning with lengthy solutions. Our extensive\nevaluations reveal that even advanced models like GPT-4o only achieve a 39.97%\noverall accuracy, illustrating current AI limitations in complex reasoning and\nmultimodal integration. Through the OlympicArena, we aim to advance AI towards\nsuperintelligence, equipping it to address more complex challenges in science\nand beyond. We also provide a comprehensive set of resources to support AI\nresearch, including a benchmark dataset, an open-source annotation platform, a\ndetailed evaluation tool, and a leaderboard with automatic submission features.\n","authors":["Zhen Huang","Zengzhi Wang","Shijie Xia","Xuefeng Li","Haoyang Zou","Ruijie Xu","Run-Ze Fan","Lyumanshan Ye","Ethan Chern","Yixin Ye","Yikai Zhang","Yuqing Yang","Ting Wu","Binjie Wang","Shichao Sun","Yang Xiao","Yiyuan Li","Fan Zhou","Steffi Chern","Yiwei Qin","Yan Ma","Jiadi Su","Yixiu Liu","Yuxiang Zheng","Shaoting Zhang","Dahua Lin","Yu Qiao","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2406.12753v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.05569v3","updated":"2025-03-06T12:54:37Z","published":"2024-04-08T14:43:13Z","title":"360$^\\circ$REA: Towards A Reusable Experience Accumulation with\n 360° Assessment for Multi-Agent System","summary":" Large language model agents have demonstrated remarkable advancements across\nvarious complex tasks. Recent works focus on optimizing the agent team or\nemploying self-reflection to iteratively solve complex tasks. Since these\nagents are all based on the same LLM, only conducting self-evaluation or\nremoving underperforming agents does not substantively enhance the capability\nof the agents. 
We argue that a comprehensive evaluation and accumulating\nexperience from evaluation feedback is an effective approach to improving\nsystem performance. In this paper, we propose Reusable Experience Accumulation\nwith 360$^\\circ$ Assessment (360$^\\circ$REA), a hierarchical multi-agent\nframework inspired by corporate organizational practices. The framework employs\na novel 360$^\\circ$ performance assessment method for multi-perspective\nperformance evaluation with fine-grained assessment. To enhance the capability\nof agents in addressing complex tasks, we introduce dual-level experience pool\nfor agents to accumulate experience through fine-grained assessment. Extensive\nexperiments on complex task datasets demonstrate the effectiveness of\n360$^\\circ$REA.\n","authors":["Shen Gao","Hao Li","Chengrui Huang","Quan Tu","Zhiliang Tian","Minlie Huang","Shuo Shang"],"pdf_url":"https://arxiv.org/pdf/2404.05569v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04398v1","updated":"2025-03-06T12:52:22Z","published":"2025-03-06T12:52:22Z","title":"Speculative MoE: Communication Efficient Parallel MoE Inference with\n Speculative Token and Expert Pre-scheduling","summary":" MoE (Mixture of Experts) prevails as a neural architecture that can scale\nmodern transformer-based LLMs (Large Language Models) to unprecedented scales.\nNevertheless, large MoEs' great demands of computing power, memory capacity and\nmemory bandwidth make scalable serving a fundamental challenge and efficient\nparallel inference has become a requisite to attain adequate throughput under\nlatency constraints. DeepSpeed-MoE, one state-of-the-art MoE inference\nframework, adopts a 3D-parallel paradigm including EP (Expert Parallelism), TP\n(Tensor Parallel) and DP (Data Parallelism). However, our analysis shows\nDeepSpeed-MoE's inference efficiency is largely bottlenecked by EP, which is\nimplemented with costly all-to-all collectives to route token activation. 
Our\nwork aims to boost DeepSpeed-MoE by strategically reducing EP's communication\noverhead with a technique named Speculative MoE. Speculative MoE has two\nspeculative parallelization schemes, speculative token shuffling and\nspeculative expert grouping, which predict outstanding tokens' expert routing\npaths and pre-schedule tokens and experts across devices to losslessly trim\nEP's communication volume. Besides DeepSpeed-MoE, we also build Speculative MoE\ninto a prevailing MoE inference engine SGLang. Experiments show Speculative MoE\ncan significantly boost state-of-the-art MoE inference frameworks on fast\nhomogeneous and slow heterogeneous interconnects.\n","authors":["Yan Li","Pengfei Zheng","Shuang Chen","Zewei Xu","Yunfei Du","Zhengang Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.20742v2","updated":"2025-03-06T12:50:44Z","published":"2025-02-28T05:47:34Z","title":"Structured Preference Optimization for Vision-Language Long-Horizon Task\n Planning","summary":" Existing methods for vision-language task planning excel in short-horizon\ntasks but often fall short in complex, long-horizon planning within dynamic\nenvironments. These challenges primarily arise from the difficulty of\neffectively training models to produce high-quality reasoning processes for\nlong-horizon tasks. To address this, we propose Structured Preference\nOptimization (SPO), which aims to enhance reasoning and action selection in\nlong-horizon task planning through structured preference evaluation and\noptimized training strategies. Specifically, SPO introduces: 1)\nPreference-Based Scoring and Optimization, which systematically evaluates\nreasoning chains based on task relevance, visual grounding, and historical\nconsistency; and 2) Curriculum-Guided Training, where the model progressively\nadapts from simple to complex tasks, improving its generalization ability in\nlong-horizon scenarios and enhancing reasoning robustness. 
To advance research\nin vision-language long-horizon task planning, we introduce ExtendaBench, a\ncomprehensive benchmark covering 1,509 tasks across VirtualHome and Habitat\n2.0, categorized into ultra-short, short, medium, and long tasks. Experimental\nresults demonstrate that SPO significantly improves reasoning quality and final\ndecision accuracy, outperforming prior methods on long-horizon tasks and\nunderscoring the effectiveness of preference-driven optimization in\nvision-language task planning. Specifically, SPO achieves a +5.98% GCR and\n+4.68% SR improvement in VirtualHome and a +3.30% GCR and +2.11% SR improvement\nin Habitat over the best-performing baselines.\n","authors":["Xiwen Liang","Min Lin","Weiqi Ruan","Rongtao Xu","Yuecheng Liu","Jiaqi Chen","Bingqian Lin","Yuzheng Zhuang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2502.20742v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2503.04392v1","updated":"2025-03-06T12:41:54Z","published":"2025-03-06T12:41:54Z","title":"AgentSafe: Safeguarding Large Language Model-based Multi-agent Systems\n via Hierarchical Data Management","summary":" Large Language Model based multi-agent systems are revolutionizing autonomous\ncommunication and collaboration, yet they remain vulnerable to security threats\nlike unauthorized access and data breaches. To address this, we introduce\nAgentSafe, a novel framework that enhances MAS security through hierarchical\ninformation management and memory protection. AgentSafe classifies information\nby security levels, restricting sensitive data access to authorized agents.\nAgentSafe incorporates two components: ThreatSieve, which secures communication\nby verifying information authority and preventing impersonation, and\nHierarCache, an adaptive memory management system that defends against\nunauthorized access and malicious poisoning, representing the first systematic\ndefense for agent memory. 
Experiments across various LLMs show that AgentSafe\nsignificantly boosts system resilience, achieving defense success rates above\n80% under adversarial conditions. Additionally, AgentSafe demonstrates\nscalability, maintaining robust performance as agent numbers and information\ncomplexity grow. Results underscore effectiveness of AgentSafe in securing MAS\nand its potential for real-world application.\n","authors":["Junyuan Mao","Fanci Meng","Yifan Duan","Miao Yu","Xiaojun Jia","Junfeng Fang","Yuxuan Liang","Kun Wang","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2503.04392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07918v2","updated":"2025-03-06T12:41:21Z","published":"2024-07-07T12:41:40Z","title":"Detecting new obfuscated malware variants: A lightweight and\n interpretable machine learning approach","summary":" Machine learning has been successfully applied in developing malware\ndetection systems, with a primary focus on accuracy, and increasing attention\nto reducing computational overhead and improving model interpretability.\nHowever, an important question remains underexplored: How well can machine\nlearning-based models detect entirely new forms of malware not present in the\ntraining data? In this study, we present a machine learning-based system for\ndetecting obfuscated malware that is not only highly accurate, lightweight and\ninterpretable, but also capable of successfully adapting to new types of\nmalware attacks. Our system is capable of detecting 15 malware subtypes despite\nbeing exclusively trained on one malware subtype, namely the Transponder from\nthe Spyware family. This system was built after training 15 distinct random\nforest-based models, each on a different malware subtype from the\nCIC-MalMem-2022 dataset. These models were evaluated against the entire range\nof malware subtypes, including all unseen malware subtypes. 
To maintain the\nsystem's streamlined nature, training was confined to the top five most\nimportant features, which also enhanced interpretability. The\nTransponder-focused model exhibited high accuracy, exceeding 99.8%, with an\naverage processing speed of 5.7 microseconds per file. We also illustrate how\nthe Shapley additive explanations technique can facilitate the interpretation\nof the model predictions. Our research contributes to advancing malware\ndetection methodologies, pioneering the feasibility of detecting obfuscated\nmalware by exclusively training a model on a single or a few carefully selected\nmalware subtypes and applying it to detect unseen subtypes.\n","authors":["Oladipo A. Madamidola","Felix Ngobigha","Adnane Ez-zizi"],"pdf_url":"https://arxiv.org/pdf/2407.07918v2.pdf","comment":"30 pages (excluding Appendix), 5 figures and 5 tables. Now published\n in Intelligent Systems with Applications\n (https://doi.org/10.1016/j.iswa.2024.200472)"},{"id":"http://arxiv.org/abs/2410.21083v2","updated":"2025-03-06T12:38:42Z","published":"2024-10-28T14:48:05Z","title":"Stealthy Jailbreak Attacks on Large Language Models via Benign Data\n Mirroring","summary":" Large language model (LLM) safety is a critical issue, with numerous studies\nemploying red team testing to enhance model security. Among these, jailbreak\nmethods explore potential vulnerabilities by crafting malicious prompts that\ninduce model outputs contrary to safety alignments. Existing black-box\njailbreak methods often rely on model feedback, repeatedly submitting queries\nwith detectable malicious instructions during the attack search process.\nAlthough these approaches are effective, the attacks may be intercepted by\ncontent moderators during the search process. 
We propose an improved transfer\nattack method that guides malicious prompt construction by locally training a\nmirror model of the target black-box model through benign data distillation.\nThis method offers enhanced stealth, as it does not involve submitting\nidentifiable malicious instructions to the target model during the search\nphase. Our approach achieved a maximum attack success rate of 92%, or a\nbalanced value of 80% with an average of 1.5 detectable jailbreak queries per\nsample against GPT-3.5 Turbo on a subset of AdvBench. These results underscore\nthe need for more robust defense mechanisms.\n","authors":["Honglin Mu","Han He","Yuxin Zhou","Yunlong Feng","Yang Xu","Libo Qin","Xiaoming Shi","Zeming Liu","Xudong Han","Qi Shi","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2410.21083v2.pdf","comment":"Accepted by NAACL 2025"},{"id":"http://arxiv.org/abs/2502.07527v2","updated":"2025-03-06T12:34:23Z","published":"2025-02-11T13:08:03Z","title":"Nature Language Model: Deciphering the Language of Nature for Scientific\n Discovery","summary":" Foundation models have revolutionized natural language processing and\nartificial intelligence, significantly enhancing how machines comprehend and\ngenerate human languages. Inspired by the success of these foundation models,\nresearchers have developed foundation models for individual scientific domains,\nincluding small molecules, materials, proteins, DNA, RNA and even cells.\nHowever, these models are typically trained in isolation, lacking the ability\nto integrate across different scientific domains. 
Recognizing that entities\nwithin these domains can all be represented as sequences, which together form\nthe \"language of nature\", we introduce Nature Language Model (NatureLM), a\nsequence-based science foundation model designed for scientific discovery.\nPre-trained with data from multiple scientific domains, NatureLM offers a\nunified, versatile model that enables various applications including: (i)\ngenerating and optimizing small molecules, proteins, RNA, and materials using\ntext instructions; (ii) cross-domain generation/design, such as\nprotein-to-molecule and protein-to-RNA generation; and (iii) top performance\nacross different domains, matching or surpassing state-of-the-art specialist\nmodels. NatureLM offers a promising generalist approach for various scientific\ntasks, including drug discovery (hit generation/optimization, ADMET\noptimization, synthesis), novel material design, and the development of\ntherapeutic proteins or nucleotides. We have developed NatureLM models in\ndifferent sizes (1 billion, 8 billion, and 46.7 billion parameters) and\nobserved a clear improvement in performance as the model size increases.\n","authors":["Yingce Xia","Peiran Jin","Shufang Xie","Liang He","Chuan Cao","Renqian Luo","Guoqing Liu","Yue Wang","Zequn Liu","Yuan-Jyue Chen","Zekun Guo","Yeqi Bai","Pan Deng","Yaosen Min","Ziheng Lu","Hongxia Hao","Han Yang","Jielan Li","Chang Liu","Jia Zhang","Jianwei Zhu","Ran Bi","Kehan Wu","Wei Zhang","Kaiyuan Gao","Qizhi Pei","Qian Wang","Xixian Liu","Yanting Li","Houtian Zhu","Yeqing Lu","Mingqian Ma","Zun Wang","Tian Xie","Krzysztof Maziarz","Marwin Segler","Zhao Yang","Zilong Chen","Yu Shi","Shuxin Zheng","Lijun Wu","Chen Hu","Peggy Dai","Tie-Yan Liu","Haiguang Liu","Tao Qin"],"pdf_url":"https://arxiv.org/pdf/2502.07527v2.pdf","comment":"93 pages"},{"id":"http://arxiv.org/abs/2503.04378v1","updated":"2025-03-06T12:30:24Z","published":"2025-03-06T12:30:24Z","title":"Dedicated Feedback and Edit Models Empower Inference-Time 
Scaling for\n Open-Ended General-Domain Tasks","summary":" Inference-Time Scaling has been critical to the success of recent models such\nas OpenAI o1 and DeepSeek R1. However, many techniques used to train models for\ninference-time scaling require tasks to have answers that can be verified,\nlimiting their application to domains such as math, coding and logical\nreasoning. We take inspiration from how humans make first attempts, ask for\ndetailed feedback from others and make improvements based on such feedback\nacross a wide spectrum of open-ended endeavors. To this end, we collect data\nfor and train dedicated Feedback and Edit Models that are capable of performing\ninference-time scaling for open-ended general-domain tasks. In our setup, one\nmodel generates an initial response, which are given feedback by a second\nmodel, that are then used by a third model to edit the response. We show that\nperformance on Arena Hard, a benchmark strongly predictive of Chatbot Arena Elo\ncan be boosted by scaling the number of initial response drafts, effective\nfeedback and edited responses. When scaled optimally, our setup based on 70B\nmodels from the Llama 3 family can reach SoTA performance on Arena Hard at 92.7\nas of 5 Mar 2025, surpassing OpenAI o1-preview-2024-09-12 with 90.4 and\nDeepSeek R1 with 92.3.\n","authors":["Zhilin Wang","Jiaqi Zeng","Olivier Delalleau","Daniel Egert","Ellie Evans","Hoo-Chang Shin","Felipe Soares","Yi Dong","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2503.04378v1.pdf","comment":"22 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.10329v2","updated":"2025-03-06T12:16:09Z","published":"2024-09-16T14:39:15Z","title":"InfoDisent: Explainability of Image Classification Models by Information\n Disentanglement","summary":" In this work, we introduce InfoDisent, a hybrid approach to explainability\nbased on the information bottleneck principle. 
InfoDisent enables the\ndisentanglement of information in the final layer of any pretrained model into\natomic concepts, which can be interpreted as prototypical parts. This approach\nmerges the flexibility of post-hoc methods with the concept-level modeling\ncapabilities of self-explainable neural networks, such as ProtoPNets. We\ndemonstrate the effectiveness of InfoDisent through computational experiments\nand user studies across various datasets using modern backbones such as ViTs\nand convolutional networks. Notably, InfoDisent generalizes the prototypical\nparts approach to novel domains (ImageNet).\n","authors":["Łukasz Struski","Dawid Rymarczyk","Jacek Tabor"],"pdf_url":"https://arxiv.org/pdf/2409.10329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01257v2","updated":"2025-03-06T12:13:14Z","published":"2024-10-02T06:05:52Z","title":"HelpSteer2-Preference: Complementing Ratings with Preferences","summary":" Reward models are critical for aligning models to follow instructions, and\nare typically trained following one of two popular paradigms: Bradley-Terry\nstyle or Regression style. However, there is a lack of evidence that either\napproach is better than the other, when adequately matched for data. This is\nprimarily because these approaches require data collected in different (but\nincompatible) formats, meaning that adequately matched data is not available in\nexisting public datasets. To tackle this problem, we release preference\nannotations (designed for Bradley-Terry training) to complement existing\nratings (designed for Regression style training) in the HelpSteer2 dataset. To\nimprove data interpretability, preference annotations are accompanied with\nhuman-written justifications. Using this data, we conduct the first\nhead-to-head comparison of Bradley-Terry and Regression models when adequately\nmatched for data. 
Based on insights derived from such a comparison, we propose\na novel approach to combine Bradley-Terry and Regression reward modeling. A\nLlama-3.1-70B-Instruct model tuned with this approach scores 94.1 on\nRewardBench, emerging top of more than 140 reward models as of 1 Oct 2024. This\nreward model can then be used with REINFORCE algorithm (RLHF) to align an\nInstruct model to reach 85.0 on Arena Hard, which is No. 1 as of 1 Oct 2024. We\nopen-source this dataset (CC-BY-4.0 license) at\nhttps://huggingface.co/datasets/nvidia/HelpSteer2#preferences-new -- 1-oct-2024\nand openly release the trained Reward and Instruct models at\nhttps://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Reward and\nhttps://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct\n","authors":["Zhilin Wang","Alexander Bukharin","Olivier Delalleau","Daniel Egert","Gerald Shen","Jiaqi Zeng","Oleksii Kuchaiev","Yi Dong"],"pdf_url":"https://arxiv.org/pdf/2410.01257v2.pdf","comment":"Accepted to ICLR 2025; 28 pages, 3 figures"},{"id":"http://arxiv.org/abs/2503.04363v1","updated":"2025-03-06T12:06:54Z","published":"2025-03-06T12:06:54Z","title":"Causally Reliable Concept Bottleneck Models","summary":" Concept-based models are an emerging paradigm in deep learning that\nconstrains the inference process to operate through human-interpretable\nconcepts, facilitating explainability and human interaction. However, these\narchitectures, on par with popular opaque neural models, fail to account for\nthe true causal mechanisms underlying the target phenomena represented in the\ndata. This hampers their ability to support causal reasoning tasks, limits\nout-of-distribution generalization, and hinders the implementation of fairness\nconstraints. 
To overcome these issues, we propose \\emph{Causally reliable\nConcept Bottleneck Models} (C$^2$BMs), a class of concept-based architectures\nthat enforce reasoning through a bottleneck of concepts structured according to\na model of the real-world causal mechanisms. We also introduce a pipeline to\nautomatically learn this structure from observational data and\n\\emph{unstructured} background knowledge (e.g., scientific literature).\nExperimental evidence suggest that C$^2$BM are more interpretable, causally\nreliable, and improve responsiveness to interventions w.r.t. standard opaque\nand concept-based models, while maintaining their accuracy.\n","authors":["Giovanni De Felice","Arianna Casanova Flores","Francesco De Santis","Silvia Santini","Johannes Schneider","Pietro Barbiero","Alberto Termine"],"pdf_url":"https://arxiv.org/pdf/2503.04363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04362v1","updated":"2025-03-06T12:04:56Z","published":"2025-03-06T12:04:56Z","title":"A Generalist Cross-Domain Molecular Learning Framework for\n Structure-Based Drug Discovery","summary":" Structure-based drug discovery (SBDD) is a systematic scientific process that\ndevelops new drugs by leveraging the detailed physical structure of the target\nprotein. Recent advancements in pre-trained models for biomolecules have\ndemonstrated remarkable success across various biochemical applications,\nincluding drug discovery and protein engineering. However, in most approaches,\nthe pre-trained models primarily focus on the characteristics of either small\nmolecules or proteins, without delving into their binding interactions which\nare essential cross-domain relationships pivotal to SBDD. 
To fill this gap, we\npropose a general-purpose foundation model named BIT (an abbreviation for\nBiomolecular Interaction Transformer), which is capable of encoding a range of\nbiochemical entities, including small molecules, proteins, and protein-ligand\ncomplexes, as well as various data formats, encompassing both 2D and 3D\nstructures. Specifically, we introduce Mixture-of-Domain-Experts (MoDE) to\nhandle the biomolecules from diverse biochemical domains and\nMixture-of-Structure-Experts (MoSE) to capture positional dependencies in the\nmolecular structures. The proposed mixture-of-experts approach enables BIT to\nachieve both deep fusion and domain-specific encoding, effectively capturing\nfine-grained molecular interactions within protein-ligand complexes. Then, we\nperform cross-domain pre-training on the shared Transformer backbone via\nseveral unified self-supervised denoising tasks. Experimental results on\nvarious benchmarks demonstrate that BIT achieves exceptional performance in\ndownstream tasks, including binding affinity prediction, structure-based\nvirtual screening, and molecular property prediction.\n","authors":["Yiheng Zhu","Mingyang Li","Junlong Liu","Kun Fu","Jiansheng Wu","Qiuyi Li","Mingze Yin","Jieping Ye","Jian Wu","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04357v1","updated":"2025-03-06T12:01:20Z","published":"2025-03-06T12:01:20Z","title":"scDD: Latent Codes Based scRNA-seq Dataset Distillation with Foundation\n Model Knowledge","summary":" Single-cell RNA sequencing (scRNA-seq) technology has profiled hundreds of\nmillions of human cells across organs, diseases, development and perturbations\nto date. 
However, the high-dimensional sparsity, batch effect noise, category\nimbalance, and ever-increasing data scale of the original sequencing data pose\nsignificant challenges for multi-center knowledge transfer, data fusion, and\ncross-validation between scRNA-seq datasets. To address these barriers, (1) we\nfirst propose a latent codes-based scRNA-seq dataset distillation framework\nnamed scDD, which transfers and distills foundation model knowledge and\noriginal dataset information into a compact latent space and generates\nsynthetic scRNA-seq dataset by a generator to replace the original dataset.\nThen, (2) we propose a single-step conditional diffusion generator named SCDG,\nwhich perform single-step gradient back-propagation to help scDD optimize\ndistillation quality and avoid gradient decay caused by multi-step\nback-propagation. Meanwhile, SCDG ensures the scRNA-seq data characteristics\nand inter-class discriminability of the synthetic dataset through flexible\nconditional control and generation quality assurance. Finally, we propose a\ncomprehensive benchmark to evaluate the performance of scRNA-seq dataset\ndistillation in different data analysis tasks. It is validated that our\nproposed method can achieve 7.61% absolute and 15.70% relative improvement over\nprevious state-of-the-art methods on average task.\n","authors":["Zhen Yu","Jianan Han","Yang Liu","Qingchao Chen"],"pdf_url":"https://arxiv.org/pdf/2503.04357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01334v3","updated":"2025-03-06T11:59:11Z","published":"2024-08-02T15:32:42Z","title":"A Backbone for Long-Horizon Robot Task Understanding","summary":" End-to-end robot learning, particularly for long-horizon tasks, often results\nin unpredictable outcomes and poor generalization. To address these challenges,\nwe propose a novel Therblig-Based Backbone Framework (TBBF) as a fundamental\nstructure to enhance interpretability, data efficiency, and generalization in\nrobotic systems. 
TBBF utilizes expert demonstrations to enable therblig-level\ntask decomposition, facilitate efficient action-object mapping, and generate\nadaptive trajectories for new scenarios. The approach consists of two stages:\noffline training and online testing. During the offline training stage, we\ndeveloped the Meta-RGate SynerFusion (MGSF) network for accurate therblig\nsegmentation across various tasks. In the online testing stage, after a\none-shot demonstration of a new task is collected, our MGSF network extracts\nhigh-level knowledge, which is then encoded into the image using Action\nRegistration (ActionREG). Additionally, Large Language Model (LLM)-Alignment\nPolicy for Visual Correction (LAP-VC) is employed to ensure precise action\nregistration, facilitating trajectory transfer in novel robot scenarios.\nExperimental results validate these methods, achieving 94.37% recall in\ntherblig segmentation and success rates of 94.4% and 80% in real-world online\nrobot testing for simple and complex scenarios, respectively. Supplementary\nmaterial is available at:\nhttps://sites.google.com/view/therbligsbasedbackbone/home\n","authors":["Xiaoshuai Chen","Wei Chen","Dongmyoung Lee","Yukun Ge","Nicolas Rojas","Petar Kormushev"],"pdf_url":"https://arxiv.org/pdf/2408.01334v3.pdf","comment":"8 pages, 8 figures. This work has been published by IEEE Robotics and\n Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2407.12468v3","updated":"2025-03-06T11:53:49Z","published":"2024-07-17T10:40:39Z","title":"Evaluating Search Engines and Large Language Models for Answering Health\n Questions","summary":" Search engines (SEs) have traditionally been primary tools for information\nseeking, but the new Large Language Models (LLMs) are emerging as powerful\nalternatives, particularly for question-answering tasks. 
This study compares\nthe performance of four popular SEs, seven LLMs, and retrieval-augmented (RAG)\nvariants in answering 150 health-related questions from the TREC Health\nMisinformation (HM) Track. Results reveal SEs correctly answer between 50 and\n70% of questions, often hindered by many retrieval results not responding to\nthe health question. LLMs deliver higher accuracy, correctly answering about\n80% of questions, though their performance is sensitive to input prompts. RAG\nmethods significantly enhance smaller LLMs' effectiveness, improving accuracy\nby up to 30% by integrating retrieval evidence.\n","authors":["Marcos Fernández-Pichel","Juan C. Pichel","David E. Losada"],"pdf_url":"https://arxiv.org/pdf/2407.12468v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04343v1","updated":"2025-03-06T11:39:46Z","published":"2025-03-06T11:39:46Z","title":"Talking Back -- human input and explanations to interactive AI systems","summary":" While XAI focuses on providing AI explanations to humans, can the reverse -\nhumans explaining their judgments to AI - foster richer, synergistic human-AI\nsystems? This paper explores various forms of human inputs to AI and examines\nhow human explanations can guide machine learning models toward automated\njudgments and explanations that align more closely with human concepts.\n","authors":["Alan Dix","Tommaso Turchi","Ben Wilson","Anna Monreale","Matt Roach"],"pdf_url":"https://arxiv.org/pdf/2503.04343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12113v2","updated":"2025-03-06T11:33:28Z","published":"2024-01-22T16:51:01Z","title":"Extracting Formulae in Many-Valued Logic from Deep Neural Networks","summary":" We propose a new perspective on deep ReLU networks, namely as circuit\ncounterparts of Lukasiewicz infinite-valued logic -- a many-valued (MV)\ngeneralization of Boolean logic. An algorithm for extracting formulae in MV\nlogic from deep ReLU networks is presented. 
As the algorithm applies to\nnetworks with general, in particular also real-valued, weights, it can be used\nto extract logical formulae from deep ReLU networks trained on data.\n","authors":["Yani Zhang","Helmut Bölcskei"],"pdf_url":"https://arxiv.org/pdf/2401.12113v2.pdf","comment":"Signicant extension of the previous version"},{"id":"http://arxiv.org/abs/2503.04328v1","updated":"2025-03-06T11:27:55Z","published":"2025-03-06T11:27:55Z","title":"Solving Word-Sense Disambiguation and Word-Sense Induction with\n Dictionary Examples","summary":" Many less-resourced languages struggle with a lack of large, task-specific\ndatasets that are required for solving relevant tasks with modern\ntransformer-based large language models (LLMs). On the other hand, many\nlinguistic resources, such as dictionaries, are rarely used in this context\ndespite their large information contents. We show how LLMs can be used to\nextend existing language resources in less-resourced languages for two\nimportant tasks: word-sense disambiguation (WSD) and word-sense induction\n(WSI). We approach the two tasks through the related but much more accessible\nword-in-context (WiC) task where, given a pair of sentences and a target word,\na classification model is tasked with predicting whether the sense of a given\nword differs between sentences. We demonstrate that a well-trained model for\nthis task can distinguish between different word senses and can be adapted to\nsolve the WSD and WSI tasks. The advantage of using the WiC task, instead of\ndirectly predicting senses, is that the WiC task does not need pre-constructed\nsense inventories with a sufficient number of examples for each sense, which\nare rarely available in less-resourced languages. We show that sentence pairs\nfor the WiC task can be successfully generated from dictionary examples using\nLLMs. The resulting prediction models outperform existing models on WiC, WSD,\nand WSI tasks. 
We demonstrate our methodology on the Slovene language, where a\nmonolingual dictionary is available, but word-sense resources are tiny.\n","authors":["Tadej Škvorc","Marko Robnik-Šikonja"],"pdf_url":"https://arxiv.org/pdf/2503.04328v1.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2412.00156v3","updated":"2025-03-06T11:05:32Z","published":"2024-11-29T08:10:49Z","title":"VISION-XL: High Definition Video Inverse Problem Solver using Latent\n Image Diffusion Models","summary":" In this paper, we propose a novel framework for solving high-definition video\ninverse problems using latent image diffusion models. Building on recent\nadvancements in spatio-temporal optimization for video inverse problems using\nimage diffusion models, our approach leverages latent-space diffusion models to\nachieve enhanced video quality and resolution. To address the high\ncomputational demands of processing high-resolution frames, we introduce a\npseudo-batch consistent sampling strategy, allowing efficient operation on a\nsingle GPU. Additionally, to improve temporal consistency, we present\npseudo-batch inversion, an initialization technique that incorporates\ninformative latents from the measurement. By integrating with SDXL, our\nframework achieves state-of-the-art video reconstruction across a wide range of\nspatio-temporal inverse problems, including complex combinations of frame\naveraging and various spatial degradations, such as deblurring,\nsuper-resolution, and inpainting. 
Unlike previous methods, our approach\nsupports multiple aspect ratios (landscape, vertical, and square) and delivers\nHD-resolution reconstructions (exceeding 1280x720) in under 6 seconds per frame\non a single NVIDIA 4090 GPU.\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2412.00156v3.pdf","comment":"Project page: https://vision-xl.github.io/"},{"id":"http://arxiv.org/abs/2501.10814v2","updated":"2025-03-06T11:05:23Z","published":"2025-01-18T16:23:09Z","title":"No More Sliding Window: Efficient 3D Medical Image Segmentation with\n Differentiable Top-k Patch Sampling","summary":" 3D models surpass 2D models in CT/MRI segmentation by effectively capturing\ninter-slice relationships. However, the added depth dimension substantially\nincreases memory consumption. While patch-based training alleviates memory\nconstraints, it significantly slows down the inference speed due to the sliding\nwindow (SW) approach. We propose No-More-Sliding-Window (NMSW), a novel\nend-to-end trainable framework that enhances the efficiency of generic 3D\nsegmentation backbone during an inference step by eliminating the need for SW.\nNMSW employs a differentiable Top-k module to selectively sample only the most\nrelevant patches, thereby minimizing redundant computations. When patch-level\npredictions are insufficient, the framework intelligently leverages coarse\nglobal predictions to refine results. Evaluated across 3 tasks using 3\nsegmentation backbones, NMSW achieves competitive accuracy compared to SW\ninference while significantly reducing computational complexity by 91% (88.0 to\n8.00 TMACs). Moreover, it delivers a 9.1x faster inference on the H100 GPU\n(99.0 to 8.3 sec) and a 11.1x faster inference on the Xeon Gold CPU (2110 to\n189 sec). 
NMSW is model-agnostic, further boosting efficiency when integrated\nwith any existing efficient segmentation backbones.\n","authors":["Young Seok Jeon","Hongfei Yang","Huazhu Fu","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2501.10814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.21057v2","updated":"2025-03-06T11:02:06Z","published":"2025-02-28T13:58:22Z","title":"Robust Deterministic Policy Gradient for Disturbance Attenuation and Its\n Application to Quadrotor Control","summary":" Practical control systems pose significant challenges in identifying optimal\ncontrol policies due to uncertainties in the system model and external\ndisturbances. While $H_\\infty$ control techniques are commonly used to design\nrobust controllers that mitigate the effects of disturbances, these methods\noften require complex and computationally intensive calculations. To address\nthis issue, this paper proposes a reinforcement learning algorithm called\nRobust Deterministic Policy Gradient (RDPG), which formulates the $H_\\infty$\ncontrol problem as a two-player zero-sum dynamic game. In this formulation, one\nplayer (the user) aims to minimize the cost, while the other player (the\nadversary) seeks to maximize it. We then employ deterministic policy gradient\n(DPG) and its deep reinforcement learning counterpart to train a robust control\npolicy with effective disturbance attenuation. In particular, for practical\nimplementation, we introduce an algorithm called robust deep deterministic\npolicy gradient (RDDPG), which employs a deep neural network architecture and\nintegrates techniques from the twin-delayed deep deterministic policy gradient\n(TD3) to enhance stability and learning efficiency. To evaluate the proposed\nalgorithm, we implement it on an unmanned aerial vehicle (UAV) tasked with\nfollowing a predefined path in a disturbance-prone environment. 
The\nexperimental results demonstrate that the proposed method outperforms other\ncontrol approaches in terms of robustness against disturbances, enabling\nprecise real-time tracking of moving targets even under severe disturbance\nconditions.\n","authors":["Taeho Lee","Donghwan Lee"],"pdf_url":"https://arxiv.org/pdf/2502.21057v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2503.03417v2","updated":"2025-03-06T11:00:35Z","published":"2025-03-05T11:47:32Z","title":"When Claims Evolve: Evaluating and Enhancing the Robustness of Embedding\n Models Against Misinformation Edits","summary":" Online misinformation remains a critical challenge, and fact-checkers\nincreasingly rely on embedding-based methods to retrieve relevant fact-checks.\nYet, when debunked claims reappear in edited forms, the performance of these\nmethods is unclear. In this work, we introduce a taxonomy of six common\nreal-world misinformation edits and propose a perturbation framework that\ngenerates valid, natural claim variations. Our multi-stage retrieval evaluation\nreveals that standard embedding models struggle with user-introduced edits,\nwhile LLM-distilled embeddings offer improved robustness at a higher\ncomputational cost. Although a strong reranker helps mitigate some issues, it\ncannot fully compensate for first-stage retrieval gaps. Addressing these\nretrieval gaps, our train- and inference-time mitigation approaches enhance\nin-domain robustness by up to 17 percentage points and boost out-of-domain\ngeneralization by 10 percentage points over baseline models. 
Overall, our\nfindings provide practical improvements to claim-matching systems, enabling\nmore reliable fact-checking of evolving misinformation.\n","authors":["Jabez Magomere","Emanuele La Malfa","Manuel Tonneau","Ashkan Kazemi","Scott Hale"],"pdf_url":"https://arxiv.org/pdf/2503.03417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04315v1","updated":"2025-03-06T10:58:35Z","published":"2025-03-06T10:58:35Z","title":"Provable Robust Overfitting Mitigation in Wasserstein Distributionally\n Robust Optimization","summary":" Wasserstein distributionally robust optimization (WDRO) optimizes against\nworst-case distributional shifts within a specified uncertainty set, leading to\nenhanced generalization on unseen adversarial examples, compared to standard\nadversarial training which focuses on pointwise adversarial perturbations.\nHowever, WDRO still suffers fundamentally from the robust overfitting problem,\nas it does not consider statistical error. We address this gap by proposing a\nnovel robust optimization framework under a new uncertainty set for adversarial\nnoise via Wasserstein distance and statistical error via Kullback-Leibler\ndivergence, called the Statistically Robust WDRO. We establish a robust\ngeneralization bound for the new optimization framework, implying that\nout-of-distribution adversarial performance is at least as good as the\nstatistically robust training loss with high probability. 
Furthermore, we\nderive conditions under which Stackelberg and Nash equilibria exist between the\nlearner and the adversary, giving an optimal robust model in certain sense.\nFinally, through extensive experiments, we demonstrate that our method\nsignificantly mitigates robust overfitting and enhances robustness within the\nframework of WDRO.\n","authors":["Shuang Liu","Yihan Wang","Yifan Zhu","Yibo Miao","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2503.04315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04302v1","updated":"2025-03-06T10:42:18Z","published":"2025-03-06T10:42:18Z","title":"Malware Detection at the Edge with Lightweight LLMs: A Performance\n Evaluation","summary":" The rapid evolution of malware attacks calls for the development of\ninnovative detection methods, especially in resource-constrained edge\ncomputing. Traditional detection techniques struggle to keep up with modern\nmalware's sophistication and adaptability, prompting a shift towards advanced\nmethodologies like those leveraging Large Language Models (LLMs) for enhanced\nmalware detection. However, deploying LLMs for malware detection directly at\nedge devices raises several challenges, including ensuring accuracy in\nconstrained environments and addressing edge devices' energy and computational\nlimits. To tackle these challenges, this paper proposes an architecture\nleveraging lightweight LLMs' strengths while addressing limitations like\nreduced accuracy and insufficient computational power. To evaluate the\neffectiveness of the proposed lightweight LLM-based approach for edge\ncomputing, we perform an extensive experimental evaluation using several\nstate-of-the-art lightweight LLMs. 
We test them with several publicly available\ndatasets specifically designed for edge and IoT scenarios and different edge\nnodes with varying computational power and characteristics.\n","authors":["Christian Rondanini","Barbara Carminati","Elena Ferrari","Antonio Gaudiano","Ashish Kundu"],"pdf_url":"https://arxiv.org/pdf/2503.04302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01924v2","updated":"2025-03-06T10:39:48Z","published":"2024-05-03T08:34:13Z","title":"Semi-Parametric Retrieval via Binary Bag-of-Tokens Index","summary":" Information retrieval has transitioned from standalone systems into essential\ncomponents across broader applications, with indexing efficiency,\ncost-effectiveness, and freshness becoming increasingly critical yet often\noverlooked. In this paper, we introduce SemI-parametric Disentangled Retrieval\n(SiDR), a bi-encoder retrieval framework that decouples retrieval index from\nneural parameters to enable efficient, low-cost, and parameter-agnostic\nindexing for emerging use cases. Specifically, in addition to using embeddings\nas indexes like existing neural retrieval methods, SiDR supports a\nnon-parametric tokenization index for search, achieving BM25-like indexing\ncomplexity with significantly better effectiveness. 
Our comprehensive\nevaluation across 16 retrieval benchmarks demonstrates that SiDR outperforms\nboth neural and term-based retrieval baselines under the same indexing\nworkload: (i) When using an embedding-based index, SiDR exceeds the performance\nof conventional neural retrievers while maintaining similar training\ncomplexity; (ii) When using a tokenization-based index, SiDR drastically\nreduces indexing cost and time, matching the complexity of traditional\nterm-based retrieval, while consistently outperforming BM25 on all in-domain\ndatasets; (iii) Additionally, we introduce a late parametric mechanism that\nmatches BM25 index preparation time while outperforming other neural retrieval\nbaselines in effectiveness.\n","authors":["Jiawei Zhou","Li Dong","Furu Wei","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04299v1","updated":"2025-03-06T10:39:47Z","published":"2025-03-06T10:39:47Z","title":"Mapping AI Benchmark Data to Quantitative Risk Estimates Through Expert\n Elicitation","summary":" The literature and multiple experts point to many potential risks from large\nlanguage models (LLMs), but there are still very few direct measurements of the\nactual harms posed. AI risk assessment has so far focused on measuring the\nmodels' capabilities, but the capabilities of models are only indicators of\nrisk, not measures of risk. Better modeling and quantification of AI risk\nscenarios can help bridge this disconnect and link the capabilities of LLMs to\ntangible real-world harm. This paper makes an early contribution to this field\nby demonstrating how existing AI benchmarks can be used to facilitate the\ncreation of risk estimates. We describe the results of a pilot study in which\nexperts use information from Cybench, an AI benchmark, to generate probability\nestimates. 
We show that the methodology seems promising for this purpose, while\nnoting improvements that can be made to further strengthen its application in\nquantitative AI risk assessment.\n","authors":["Malcolm Murray","Henry Papadatos","Otter Quarks","Pierre-François Gimenez","Simeon Campos"],"pdf_url":"https://arxiv.org/pdf/2503.04299v1.pdf","comment":"23 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.14507v3","updated":"2025-03-06T10:26:32Z","published":"2024-08-24T16:54:08Z","title":"Prompt-Matcher: Leveraging Large Models to Reduce Uncertainty in Schema\n Matching Results","summary":" Schema matching is the process of identifying correspondences between the\nelements of two given schemata, essential for database management systems, data\nintegration, and data warehousing. For datasets across different scenarios, the\noptimal schema matching algorithm is different. For single algorithm,\nhyperparameter tuning also cases multiple results. All results assigned equal\nprobabilities are stored in probabilistic databases to facilitate uncertainty\nmanagement. The substantial degree of uncertainty diminishes the efficiency and\nreliability of data processing, thereby precluding the provision of more\naccurate information for decision-makers. To address this problem, we introduce\na new approach based on fine-grained correspondence verification with specific\nprompt of Large Language Model.\n Our approach is an iterative loop that consists of three main components: (1)\nthe correspondence selection algorithm, (2) correspondence verification, and\n(3) the update of probability distribution. The core idea is that\ncorrespondences intersect across multiple results, thereby linking the\nverification of correspondences to the reduction of uncertainty in candidate\nresults.\n The task of selecting an optimal correspondence set to maximize the\nanticipated uncertainty reduction within a fixed budgetary framework is\nestablished as an NP-hard problem. 
We propose a novel $(1-1/e)$-approximation\nalgorithm that significantly outperforms brute algorithm in terms of\ncomputational efficiency. To enhance correspondence verification, we have\ndeveloped two prompt templates that enable GPT-4 to achieve state-of-the-art\nperformance across two established benchmark datasets. Our comprehensive\nexperimental evaluation demonstrates the superior effectiveness and robustness\nof the proposed approach.\n","authors":["Longyu Feng","Huahang Li","Chen Jason Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14507v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04291v1","updated":"2025-03-06T10:19:01Z","published":"2025-03-06T10:19:01Z","title":"MathMistake Checker: A Comprehensive Demonstration for Step-by-Step Math\n Problem Mistake Finding by Prompt-Guided LLMs","summary":" We propose a novel system, MathMistake Checker, designed to automate\nstep-by-step mistake finding in mathematical problems with lengthy answers\nthrough a two-stage process. The system aims to simplify grading, increase\nefficiency, and enhance learning experiences from a pedagogical perspective. It\nintegrates advanced technologies, including computer vision and the\nchain-of-thought capabilities of the latest large language models (LLMs). Our\nsystem supports open-ended grading without reference answers and promotes\npersonalized learning by providing targeted feedback. We demonstrate its\neffectiveness across various types of math problems, such as calculation and\nword problems.\n","authors":["Tianyang Zhang","Zhuoxuan Jiang","Haotian Zhang","Lin Lin","Shaohua Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04291v1.pdf","comment":"Published in AAAI 2025"},{"id":"http://arxiv.org/abs/2503.04290v1","updated":"2025-03-06T10:17:52Z","published":"2025-03-06T10:17:52Z","title":"How Do Hackathons Foster Creativity? 
Towards AI Collaborative Evaluation\n of Creativity at Scale","summary":" Hackathons have become popular collaborative events for accelerating the\ndevelopment of creative ideas and prototypes. There are several case studies\nshowcasing creative outcomes across domains such as industry, education, and\nresearch. However, there are no large-scale studies on creativity in hackathons\nwhich can advance theory on how hackathon formats lead to creative outcomes. We\nconducted a computational analysis of 193,353 hackathon projects. By\noperationalizing creativity through usefulness and novelty, we refined our\ndataset to 10,363 projects, allowing us to analyze how participant\ncharacteristics, collaboration patterns, and hackathon setups influence the\ndevelopment of creative projects. The contribution of our paper is twofold: We\nidentified means for organizers to foster creativity in hackathons. We also\nexplore the use of large language models (LLMs) to augment the evaluation of\ncreative outcomes and discuss challenges and opportunities of doing this, which\nhas implications for creativity research at large.\n","authors":["Jeanette Falk","Yiyi Chen","Janet Rafner","Mike Zhang","Johannes Bjerva","Alexander Nolte"],"pdf_url":"https://arxiv.org/pdf/2503.04290v1.pdf","comment":"Accepted in Proceedings of the 2025 CHI Conference on Human Factors\n in Computing Systems"},{"id":"http://arxiv.org/abs/2503.04283v1","updated":"2025-03-06T10:09:20Z","published":"2025-03-06T10:09:20Z","title":"Explainable AI in Time-Sensitive Scenarios: Prefetched Offline\n Explanation Model","summary":" As predictive machine learning models become increasingly adopted and\nadvanced, their role has evolved from merely predicting outcomes to actively\nshaping them. 
This evolution has underscored the importance of Trustworthy AI,\nhighlighting the necessity to extend our focus beyond mere accuracy and toward\na comprehensive understanding of these models' behaviors within the specific\ncontexts of their applications. To further progress in explainability, we\nintroduce Poem, Prefetched Offline Explanation Model, a model-agnostic, local\nexplainability algorithm for image data. The algorithm generates exemplars,\ncounterexemplars and saliency maps to provide quick and effective explanations\nsuitable for time-sensitive scenarios. Leveraging an existing local algorithm,\n\\poem{} infers factual and counterfactual rules from data to create\nillustrative examples and opposite scenarios with an enhanced stability by\ndesign. A novel mechanism then matches incoming test points with an explanation\nbase and produces diverse exemplars, informative saliency maps and believable\ncounterexemplars. Experimental results indicate that Poem outperforms its\npredecessor Abele in speed and ability to generate more nuanced and varied\nexemplars alongside more insightful saliency maps and valuable\ncounterexemplars.\n","authors":["Fabio Michele Russo","Carlo Metta","Anna Monreale","Salvatore Rinzivillo","Fabio Pinelli"],"pdf_url":"https://arxiv.org/pdf/2503.04283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04280v1","updated":"2025-03-06T10:08:44Z","published":"2025-03-06T10:08:44Z","title":"Towards Autonomous Reinforcement Learning for Real-World Robotic\n Manipulation with Large Language Models","summary":" Recent advancements in Large Language Models (LLMs) and Visual Language\nModels (VLMs) have significantly impacted robotics, enabling high-level\nsemantic motion planning applications. Reinforcement Learning (RL), a\ncomplementary paradigm, enables agents to autonomously optimize complex\nbehaviors through interaction and reward signals. 
However, designing effective\nreward functions for RL remains challenging, especially in real-world tasks\nwhere sparse rewards are insufficient and dense rewards require elaborate\ndesign. In this work, we propose Autonomous Reinforcement learning for Complex\nHumanInformed Environments (ARCHIE), an unsupervised pipeline leveraging GPT-4,\na pre-trained LLM, to generate reward functions directly from natural language\ntask descriptions. The rewards are used to train RL agents in simulated\nenvironments, where we formalize the reward generation process to enhance\nfeasibility. Additionally, GPT-4 automates the coding of task success criteria,\ncreating a fully automated, one-shot procedure for translating human-readable\ntext into deployable robot skills. Our approach is validated through extensive\nsimulated experiments on single-arm and bi-manual manipulation tasks using an\nABB YuMi collaborative robot, highlighting its practicality and effectiveness.\nTasks are demonstrated on the real robot setup.\n","authors":["Niccolò Turcato","Matteo Iovino","Aris Synodinos","Alberto Dalla Libera","Ruggero Carli","Pietro Falco"],"pdf_url":"https://arxiv.org/pdf/2503.04280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04267v1","updated":"2025-03-06T09:56:07Z","published":"2025-03-06T09:56:07Z","title":"Prompt Programming: A Platform for Dialogue-based Computational Problem\n Solving with Generative AI Models","summary":" Computing students increasingly rely on generative AI tools for programming\nassistance, often without formal instruction or guidance. This highlights a\nneed to teach students how to effectively interact with AI models, particularly\nthrough natural language prompts, to generate and critically evaluate code for\nsolving computational tasks. 
To address this, we developed a novel platform for\nprompt programming that enables authentic dialogue-based interactions, supports\nproblems involving multiple interdependent functions, and offers on-request\nexecution of generated code. Data analysis from over 900 students in an\nintroductory programming course revealed high engagement, with the majority of\nprompts occurring within multi-turn dialogues. Problems with multiple\ninterdependent functions encouraged iterative refinement, with progression\ngraphs highlighting several common strategies. Students were highly selective\nabout the code they chose to test, suggesting that on-request execution of\ngenerated code promoted critical thinking. Given the growing importance of\nlearning dialogue-based programming with AI, we provide this tool as a publicly\naccessible resource, accompanied by a corpus of programming problems for\neducational use.\n","authors":["Victor-Alexandru Pădurean","Paul Denny","Alkis Gotovos","Adish Singla"],"pdf_url":"https://arxiv.org/pdf/2503.04267v1.pdf","comment":"Preprint of the ITiCSE'25 paper"},{"id":"http://arxiv.org/abs/2503.04262v1","updated":"2025-03-06T09:46:16Z","published":"2025-03-06T09:46:16Z","title":"Guidelines for Applying RL and MARL in Cybersecurity Applications","summary":" Reinforcement Learning (RL) and Multi-Agent Reinforcement Learning (MARL)\nhave emerged as promising methodologies for addressing challenges in automated\ncyber defence (ACD). These techniques offer adaptive decision-making\ncapabilities in high-dimensional, adversarial environments. This report\nprovides a structured set of guidelines for cybersecurity professionals and\nresearchers to assess the suitability of RL and MARL for specific use cases,\nconsidering factors such as explainability, exploration needs, and the\ncomplexity of multi-agent coordination. 
It also discusses key algorithmic\napproaches, implementation challenges, and real-world constraints, such as data\nscarcity and adversarial interference. The report further outlines open\nresearch questions, including policy optimality, agent cooperation levels, and\nthe integration of MARL systems into operational cybersecurity frameworks. By\nbridging theoretical advancements and practical deployment, these guidelines\naim to enhance the effectiveness of AI-driven cyber defence strategies.\n","authors":["Vasilios Mavroudis","Gregory Palmer","Sara Farmer","Kez Smithson Whitehead","David Foster","Adam Price","Ian Miles","Alberto Caron","Stephen Pasteris"],"pdf_url":"https://arxiv.org/pdf/2503.04262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04261v1","updated":"2025-03-06T09:44:18Z","published":"2025-03-06T09:44:18Z","title":"VirtualXAI: A User-Centric Framework for Explainability Assessment\n Leveraging GPT-Generated Personas","summary":" In today's data-driven era, computational systems generate vast amounts of\ndata that drive the digital transformation of industries, where Artificial\nIntelligence (AI) plays a key role. Currently, the demand for eXplainable AI\n(XAI) has increased to enhance the interpretability, transparency, and\ntrustworthiness of AI models. However, evaluating XAI methods remains\nchallenging: existing evaluation frameworks typically focus on quantitative\nproperties such as fidelity, consistency, and stability without taking into\naccount qualitative characteristics such as satisfaction and interpretability.\nIn addition, practitioners face a lack of guidance in selecting appropriate\ndatasets, AI models, and XAI methods -a major hurdle in human-AI collaboration.\nTo address these gaps, we propose a framework that integrates quantitative\nbenchmarking with qualitative user assessments through virtual personas based\non the \"Anthology\" of backstories of the Large Language Model (LLM). 
Our\nframework also incorporates a content-based recommender system that leverages\ndataset-specific characteristics to match new input data with a repository of\nbenchmarked datasets. This yields an estimated XAI score and provides tailored\nrecommendations for both the optimal AI model and the XAI method for a given\nscenario.\n","authors":["Georgios Makridis","Vasileios Koukos","Georgios Fatouros","Dimosthenis Kyriazis"],"pdf_url":"https://arxiv.org/pdf/2503.04261v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2503.04258v1","updated":"2025-03-06T09:39:36Z","published":"2025-03-06T09:39:36Z","title":"TAIL: Text-Audio Incremental Learning","summary":" Many studies combine text and audio to capture multi-modal information but\nthey overlook the model's generalization ability on new datasets. Introducing\nnew datasets may affect the feature space of the original dataset, leading to\ncatastrophic forgetting. Meanwhile, large model parameters can significantly\nimpact training performance. To address these limitations, we introduce a novel\ntask called Text-Audio Incremental Learning (TAIL) task for text-audio\nretrieval, and propose a new method, PTAT, Prompt Tuning for Audio-Text\nincremental learning. This method utilizes prompt tuning to optimize the model\nparameters while incorporating an audio-text similarity and feature\ndistillation module to effectively mitigate catastrophic forgetting. We\nbenchmark our method and previous incremental learning methods on AudioCaps,\nClotho, BBC Sound Effects and Audioset datasets, and our method outperforms\nprevious methods significantly, particularly demonstrating stronger resistance\nto forgetting on older datasets. 
Compared to the full-parameters Finetune\n(Sequential) method, our model only requires 2.42\\% of its parameters,\nachieving 4.46\\% higher performance.\n","authors":["Yingfei Sun","Xu Gu","Wei Ji","Hanbin Zhao","Hao Fei","Yifang Yin","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2503.04258v1.pdf","comment":"4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2503.04257v1","updated":"2025-03-06T09:39:09Z","published":"2025-03-06T09:39:09Z","title":"How to Move Your Dragon: Text-to-Motion Synthesis for Large-Vocabulary\n Objects","summary":" Motion synthesis for diverse object categories holds great potential for 3D\ncontent creation but remains underexplored due to two key challenges: (1) the\nlack of comprehensive motion datasets that include a wide range of high-quality\nmotions and annotations, and (2) the absence of methods capable of handling\nheterogeneous skeletal templates from diverse objects. To address these\nchallenges, we contribute the following: First, we augment the Truebones Zoo\ndataset, a high-quality animal motion dataset covering over 70 species, by\nannotating it with detailed text descriptions, making it suitable for\ntext-based motion synthesis. Second, we introduce rig augmentation techniques\nthat generate diverse motion data while preserving consistent dynamics,\nenabling models to adapt to various skeletal configurations. Finally, we\nredesign existing motion diffusion models to dynamically adapt to arbitrary\nskeletal templates, enabling motion synthesis for a diverse range of objects\nwith varying structures. Experiments show that our method learns to generate\nhigh-fidelity motions from textual descriptions for diverse and even unseen\nobjects, setting a strong foundation for motion synthesis across diverse object\ncategories and skeletal templates. 
Qualitative results are available on this\nlink: t2m4lvo.github.io\n","authors":["Wonkwang Lee","Jongwon Jeong","Taehong Moon","Hyeon-Jong Kim","Jaehyeon Kim","Gunhee Kim","Byeong-Uk Lee"],"pdf_url":"https://arxiv.org/pdf/2503.04257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04256v1","updated":"2025-03-06T09:38:14Z","published":"2025-03-06T09:38:14Z","title":"Knowledge Retention for Continual Model-Based Reinforcement Learning","summary":" We propose DRAGO, a novel approach for continual model-based reinforcement\nlearning aimed at improving the incremental development of world models across\na sequence of tasks that differ in their reward functions but not the state\nspace or dynamics. DRAGO comprises two key components: Synthetic Experience\nRehearsal, which leverages generative models to create synthetic experiences\nfrom past tasks, allowing the agent to reinforce previously learned dynamics\nwithout storing data, and Regaining Memories Through Exploration, which\nintroduces an intrinsic reward mechanism to guide the agent toward revisiting\nrelevant states from prior tasks. Together, these components enable the agent\nto maintain a comprehensive and continually developing world model,\nfacilitating more effective learning and adaptation across diverse\nenvironments. Empirical evaluations demonstrate that DRAGO is able to preserve\nknowledge across tasks, achieving superior performance in various continual\nlearning scenarios.\n","authors":["Yixiang Sun","Haotian Fu","Michael Littman","George Konidaris"],"pdf_url":"https://arxiv.org/pdf/2503.04256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04249v1","updated":"2025-03-06T09:32:39Z","published":"2025-03-06T09:32:39Z","title":"How to Mitigate Overfitting in Weak-to-strong Generalization?","summary":" Aligning powerful AI models on tasks that surpass human evaluation\ncapabilities is the central problem of \\textbf{superalignment}. 
To address this\nproblem, weak-to-strong generalization aims to elicit the capabilities of\nstrong models through weak supervisors and ensure that the behavior of strong\nmodels aligns with the intentions of weak supervisors without unsafe behaviors\nsuch as deception. Although weak-to-strong generalization exhibiting certain\ngeneralization capabilities, strong models exhibit significant overfitting in\nweak-to-strong generalization: Due to the strong fit ability of strong models,\nerroneous labels from weak supervisors may lead to overfitting in strong\nmodels. In addition, simply filtering out incorrect labels may lead to a\ndegeneration in question quality, resulting in a weak generalization ability of\nstrong models on hard questions. To mitigate overfitting in weak-to-strong\ngeneralization, we propose a two-stage framework that simultaneously improves\nthe quality of supervision signals and the quality of input questions.\nExperimental results in three series of large language models and two\nmathematical benchmarks demonstrate that our framework significantly improves\nPGR compared to naive weak-to-strong generalization, even achieving up to 100\\%\nPGR on some models.\n","authors":["Junhao Shi","Qinyuan Cheng","Zhaoye Fei","Yining Zheng","Qipeng Guo","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2503.04249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07076v2","updated":"2025-03-06T09:13:28Z","published":"2024-11-11T15:51:48Z","title":"StoryTeller: Improving Long Video Description through Global\n Audio-Visual Character Identification","summary":" Existing large vision-language models (LVLMs) are largely limited to\nprocessing short, seconds-long videos and struggle with generating coherent\ndescriptions for extended video spanning minutes or more. Long video\ndescription introduces new challenges, such as consistent character\nidentification and plot-level descriptions incorporating both visual and audio\ninformation. 
To address these, we figure out audio-visual character\nidentification, matching character names to each dialogue, as a key factor. We\npropose StoryTeller, a system for generating dense descriptions of long videos,\nincorporating both low-level visual concepts and high-level plot information.\nStoryTeller uses a multimodal large language model that integrates visual,\naudio, and text modalities to perform audio-visual character identification on\nminute-long video clips. The results are then fed into a LVLM to enhance\nconsistency of video description. We validate our approach on movie description\ntasks and introduce MovieStory101, a dataset with dense descriptions for\nthree-minute movie clips. To evaluate long video descriptions, we create\nStoryQA, a large set of multiple-choice questions for MovieStory101 test set.\nWe assess descriptions by inputting them into GPT-4 to answer these questions,\nusing accuracy as an automatic evaluation metric. Experiments show that\nStoryTeller outperforms all open and closed-source baselines on StoryQA,\nachieving 9.5% higher accuracy than the strongest baseline, Gemini-1.5-pro, and\ndemonstrating a +15.56% advantage in human side-by-side evaluations.\nAdditionally, incorporating audio-visual character identification from\nStoryTeller improves the performance of all video description models, with\nGemini-1.5-pro and GPT-4o showing relative improvement of 5.5% and 13.0%,\nrespectively, in accuracy on StoryQA.\n","authors":["Yichen He","Yuan Lin","Jianchao Wu","Hanchong Zhang","Yuchen Zhang","Ruicheng Le"],"pdf_url":"https://arxiv.org/pdf/2411.07076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04231v1","updated":"2025-03-06T09:12:43Z","published":"2025-03-06T09:12:43Z","title":"One-Shot Clustering for Federated Learning","summary":" Federated Learning (FL) is a widespread and well adopted paradigm of\ndecentralized learning that allows training one model from multiple sources\nwithout the need to directly transfer 
data between participating clients. Since\nits inception in 2015, it has been divided into numerous sub-fields that deal\nwith application-specific issues, be it data heterogeneity or resource\nallocation. One such sub-field, Clustered Federated Learning (CFL), is dealing\nwith the problem of clustering the population of clients into separate cohorts\nto deliver personalized models. Although few remarkable works have been\npublished in this domain, the problem is still largely unexplored, as its basic\nassumption and settings are slightly different from standard FL. In this work,\nwe present One-Shot Clustered Federated Learning (OCFL), a clustering-agnostic\nalgorithm that can automatically detect the earliest suitable moment for\nclustering. Our algorithm is based on the computation of cosine similarity\nbetween gradients of the clients and a temperature measure that detects when\nthe federated model starts to converge. We empirically evaluate our methodology\nby testing various one-shot clustering algorithms for over thirty different\ntasks on three benchmark datasets. Our experiments showcase the good\nperformance of our approach when used to perform CFL in an automated manner\nwithout the need to adjust hyperparameters.\n","authors":["Maciej Krzysztof Zuziak","Roberto Pellungrini","Salvatore Rinzivillo"],"pdf_url":"https://arxiv.org/pdf/2503.04231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14153v3","updated":"2025-03-06T09:00:18Z","published":"2024-08-26T09:55:34Z","title":"Explaining Caption-Image Interactions in CLIP models with Second-Order\n Attributions","summary":" Dual encoder architectures like CLIP models map two types of inputs into a\nshared embedding space and predict similarities between them. Despite their\nsuccess, it is, however, not understood how these models compare their two\ninputs. 
Common first-order feature-attribution methods can only provide limited\ninsights into dual-encoders since their predictions depend on\nfeature-interactions rather than on individual features. In this paper, we\nfirst derive a second-order method enabling the attribution of predictions by\nany differentiable dual encoder onto feature-interactions between its inputs.\nSecond, we apply our method to CLIP models and show that they learn\nfine-grained correspondences between parts of captions and regions in images.\nThey match objects across input modes also account for mismatches. This\nvisual-linguistic grounding ability, however, varies heavily between object\nclasses and exhibits pronounced out-of-domain effects. We can identify\nindividual errors as well as systematic failure categories including object\ncoverage, unusual scenes and correlated contexts.\n","authors":["Lucas Möller","Pascal Tilli","Ngoc Thang Vu","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2408.14153v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04219v1","updated":"2025-03-06T08:54:31Z","published":"2025-03-06T08:54:31Z","title":"Quantum-Inspired Reinforcement Learning in the Presence of Epistemic\n Ambivalence","summary":" The complexity of online decision-making under uncertainty stems from the\nrequirement of finding a balance between exploiting known strategies and\nexploring new possibilities. Naturally, the uncertainty type plays a crucial\nrole in developing decision-making strategies that manage complexity\neffectively. In this paper, we focus on a specific form of uncertainty known as\nepistemic ambivalence (EA), which emerges from conflicting pieces of evidence\nor contradictory experiences. It creates a delicate interplay between\nuncertainty and confidence, distinguishing it from epistemic uncertainty that\ntypically diminishes with new information. Indeed, ambivalence can persist even\nafter additional knowledge is acquired. 
To address this phenomenon, we propose\na novel framework, called the epistemically ambivalent Markov decision process\n(EA-MDP), aiming to understand and control EA in decision-making processes.\nThis framework incorporates the concept of a quantum state from the quantum\nmechanics formalism, and its core is to assess the probability and reward of\nevery possible outcome. We calculate the reward function using quantum\nmeasurement techniques and prove the existence of an optimal policy and an\noptimal value function in the EA-MDP framework. We also propose the\nEA-epsilon-greedy Q-learning algorithm. To evaluate the impact of EA on\ndecision-making and the expedience of our framework, we study two distinct\nexperimental setups, namely the two-state problem and the lattice problem. Our\nresults show that using our methods, the agent converges to the optimal policy\nin the presence of EA.\n","authors":["Alireza Habibi","Saeed Ghoorchian","Setareh Maghsudi"],"pdf_url":"https://arxiv.org/pdf/2503.04219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02495v2","updated":"2025-03-06T08:51:47Z","published":"2025-03-04T11:01:25Z","title":"Union of Experts: Adapting Hierarchical Routing to Equivalently\n Decomposed Transformer","summary":" We propose Union-of-Experts (UoE), which decomposes transformer into an\nequitant group of experts, and then implement selective routing on input data\nand experts. Our approach advances MoE design with four key innovations: (1) We\nconducted equitant expert decomposition on both MLP blocks and attention blocks\nbased on matrix partition in tensor parallelism. (2) We developed two routing\nparadigms: patch-wise data selection and expert selection, to apply routing\nacross different levels. (3) We design the architecture of UoE model, including\nSelective Multi-Head Attention (SMHA) and Union-of-MLP-Experts (UoME). 
(4) We\ndevelop parallel implementation of UoE's routing and computation operation, and\noptimize efficiency based on the hardware processing analysis. The experiments\ndemonstrate that the UoE model surpass Full Attention, state-of-art MoEs and\nefficient transformers (including the model architecture of recently proposed\nDeepSeek-V3) in several tasks across image and natural language domains. In\nlanguage modeling tasks, we achieve an average reduction of 2.38 in perplexity\ncompared to the best-performed MoE method with an average of 76% FLOPs. In Long\nRange Arena benchmark, we recorded an average score that is at least 0.68%\nhigher than all comparison models including Full Attention, MoEs, and\ntransformer variants, with only 50% FLOPs of the best MoE method. In image\nclassification, our model yielded an average accuracy improvement of 1.75% than\nthe best model while maintaining comparable FLOPs. The source codes are\navailable at https://github.com/YujiaoYang-work/UoE.\n","authors":["Yujiao Yang","Jing Lian","Linhui Li"],"pdf_url":"https://arxiv.org/pdf/2503.02495v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2410.07009v2","updated":"2025-03-06T08:51:05Z","published":"2024-10-09T15:52:48Z","title":"Pap2Pat: Benchmarking Outline-Guided Long-Text Patent Generation with\n Patent-Paper Pairs","summary":" Dealing with long and highly complex technical text is a challenge for Large\nLanguage Models (LLMs), which still have to unfold their potential in\nsupporting expensive and timeintensive processes like patent drafting. Within\npatents, the description constitutes more than 90% of the document on average.\nYet, its automatic generation remains understudied. When drafting patent\napplications, patent attorneys typically receive invention reports (IRs), which\nare usually confidential, hindering research on LLM-supported patent drafting.\nOften, prepublication research papers serve as IRs. 
We leverage this duality to\nbuild PAP2PAT, an open and realistic benchmark for patent drafting consisting\nof 1.8k patent-paper pairs describing the same inventions. To address the\ncomplex longdocument patent generation task, we propose chunk-based\noutline-guided generation using the research paper as invention specification.\nOur extensive evaluation using PAP2PAT and a human case study show that LLMs\ncan effectively leverage information from the paper, but still struggle to\nprovide the necessary level of detail. Fine-tuning leads to more patent-style\nlanguage, but also to more hallucination. We release our data and code\nhttps://github.com/boschresearch/Pap2Pat.\n","authors":["Valentin Knappich","Simon Razniewski","Anna Hätty","Annemarie Friedrich"],"pdf_url":"https://arxiv.org/pdf/2410.07009v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04201v1","updated":"2025-03-06T08:28:44Z","published":"2025-03-06T08:28:44Z","title":"Knowledge-Decoupled Synergetic Learning: An MLLM based Collaborative\n Approach to Few-shot Multimodal Dialogue Intention Recognition","summary":" Few-shot multimodal dialogue intention recognition is a critical challenge in\nthe e-commerce domainn. Previous methods have primarily enhanced model\nclassification capabilities through post-training techniques. However, our\nanalysis reveals that training for few-shot multimodal dialogue intention\nrecognition involves two interconnected tasks, leading to a seesaw effect in\nmulti-task learning. This phenomenon is attributed to knowledge interference\nstemming from the superposition of weight matrix updates during the training\nprocess. To address these challenges, we propose Knowledge-Decoupled Synergetic\nLearning (KDSL), which mitigates these issues by utilizing smaller models to\ntransform knowledge into interpretable rules, while applying the post-training\nof larger models. 
By facilitating collaboration between the large and small\nmultimodal large language models for prediction, our approach demonstrates\nsignificant improvements. Notably, we achieve outstanding results on two real\nTaobao datasets, with enhancements of 6.37\\% and 6.28\\% in online weighted F1\nscores compared to the state-of-the-art method, thereby validating the efficacy\nof our framework.\n","authors":["Bin Chen","Yu Zhang","Hongfei Ye","Ziyi Huang","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2503.04201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04199v1","updated":"2025-03-06T08:27:51Z","published":"2025-03-06T08:27:51Z","title":"MASTER: Multimodal Segmentation with Text Prompts","summary":" RGB-Thermal fusion is a potential solution for various weather and light\nconditions in challenging scenarios. However, plenty of studies focus on\ndesigning complex modules to fuse different modalities. With the widespread\napplication of large language models (LLMs), valuable information can be more\neffectively extracted from natural language. Therefore, we aim to leverage the\nadvantages of large language models to design a structurally simple and highly\nadaptable multimodal fusion model architecture. We proposed MultimodAl\nSegmentation with TExt PRompts (MASTER) architecture, which integrates LLM into\nthe fusion of RGB-Thermal multimodal data and allows complex query text to\nparticipate in the fusion process. Our model utilizes a dual-path structure to\nextract information from different modalities of images. Additionally, we\nemploy LLM as the core module for multimodal fusion, enabling the model to\ngenerate learnable codebook tokens from RGB, thermal images, and textual\ninformation. A lightweight image decoder is used to obtain semantic\nsegmentation results. 
The proposed MASTER performs exceptionally well in\nbenchmark tests across various automated driving scenarios, yielding promising\nresults.\n","authors":["Fuyang Liu","Shun Lu","Jilin Mei","Yu Hu"],"pdf_url":"https://arxiv.org/pdf/2503.04199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12106v3","updated":"2025-03-06T08:18:50Z","published":"2024-09-18T16:26:22Z","title":"Measuring Human and AI Values Based on Generative Psychometrics with\n Large Language Models","summary":" Human values and their measurement are long-standing interdisciplinary\ninquiry. Recent advances in AI have sparked renewed interest in this area, with\nlarge language models (LLMs) emerging as both tools and subjects of value\nmeasurement. This work introduces Generative Psychometrics for Values (GPV), an\nLLM-based, data-driven value measurement paradigm, theoretically grounded in\ntext-revealed selective perceptions. The core idea is to dynamically parse\nunstructured texts into perceptions akin to static stimuli in traditional\npsychometrics, measure the value orientations they reveal, and aggregate the\nresults. Applying GPV to human-authored blogs, we demonstrate its stability,\nvalidity, and superiority over prior psychological tools. 
Then, extending GPV\nto LLM value measurement, we advance the current art with 1) a psychometric\nmethodology that measures LLM values based on their scalable and free-form\noutputs, enabling context-specific measurement; 2) a comparative analysis of\nmeasurement paradigms, indicating response biases of prior methods; and 3) an\nattempt to bridge LLM values and their safety, revealing the predictive power\nof different value systems and the impacts of various values on LLM safety.\nThrough interdisciplinary efforts, we aim to leverage AI for next-generation\npsychometrics and psychometrics for value-aligned AI.\n","authors":["Haoran Ye","Yuhang Xie","Yuanyi Ren","Hanjun Fang","Xin Zhang","Guojie Song"],"pdf_url":"https://arxiv.org/pdf/2409.12106v3.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2503.04184v1","updated":"2025-03-06T07:53:24Z","published":"2025-03-06T07:53:24Z","title":"Large-Scale AI in Telecom: Charting the Roadmap for Innovation,\n Scalability, and Enhanced Digital Experiences","summary":" This white paper discusses the role of large-scale AI in the\ntelecommunications industry, with a specific focus on the potential of\ngenerative AI to revolutionize network functions and user experiences,\nespecially in the context of 6G systems. It highlights the development and\ndeployment of Large Telecom Models (LTMs), which are tailored AI models\ndesigned to address the complex challenges faced by modern telecom networks.\nThe paper covers a wide range of topics, from the architecture and deployment\nstrategies of LTMs to their applications in network management, resource\nallocation, and optimization. It also explores the regulatory, ethical, and\nstandardization considerations for LTMs, offering insights into their future\nintegration into telecom infrastructure. 
The goal is to provide a comprehensive\nroadmap for the adoption of LTMs to enhance scalability, performance, and\nuser-centric innovation in telecom networks.\n","authors":["Adnan Shahid","Adrian Kliks","Ahmed Al-Tahmeesschi","Ahmed Elbakary","Alexandros Nikou","Ali Maatouk","Ali Mokh","Amirreza Kazemi","Antonio De Domenico","Athanasios Karapantelakis","Bo Cheng","Bo Yang","Bohao Wang","Carlo Fischione","Chao Zhang","Chaouki Ben Issaid","Chau Yuen","Chenghui Peng","Chongwen Huang","Christina Chaccour","Christo Kurisummoottil Thomas","Dheeraj Sharma","Dimitris Kalogiros","Dusit Niyato","Eli De Poorter","Elissa Mhanna","Emilio Calvanese Strinati","Faouzi Bader","Fathi Abdeldayem","Fei Wang","Fenghao Zhu","Gianluca Fontanesi","Giovanni Geraci","Haibo Zhou","Hakimeh Purmehdi","Hamed Ahmadi","Hang Zou","Hongyang Du","Hoon Lee","Howard H. Yang","Iacopo Poli","Igor Carron","Ilias Chatzistefanidis","Inkyu Lee","Ioannis Pitsiorlas","Jaron Fontaine","Jiajun Wu","Jie Zeng","Jinan Li","Jinane Karam","Johny Gemayel","Juan Deng","Julien Frison","Kaibin Huang","Kehai Qiu","Keith Ball","Kezhi Wang","Kun Guo","Leandros Tassiulas","Lecorve Gwenole","Liexiang Yue","Lina Bariah","Louis Powell","Marcin Dryjanski","Maria Amparo Canaveras Galdon","Marios Kountouris","Maryam Hafeez","Maxime Elkael","Mehdi Bennis","Mehdi Boudjelli","Meiling Dai","Merouane Debbah","Michele Polese","Mohamad Assaad","Mohamed Benzaghta","Mohammad Al Refai","Moussab Djerrab","Mubeen Syed","Muhammad Amir","Na Yan","Najla Alkaabi","Nan Li","Nassim Sehad","Navid Nikaein","Omar Hashash","Pawel Sroka","Qianqian Yang","Qiyang Zhao","Rasoul Nikbakht Silab","Rex Ying","Roberto Morabito","Rongpeng Li","Ryad Madi","Salah Eddine El Ayoubi","Salvatore D'Oro","Samson Lasaulce","Serveh Shalmashi","Sige Liu","Sihem Cherrared","Swarna Bindu Chetty","Swastika Dutta","Syed A. R. Zaidi","Tianjiao Chen","Timothy Murphy","Tommaso Melodia","Tony Q. S. 
Quek","Vishnu Ram","Walid Saad","Wassim Hamidouche","Weilong Chen","Xiaoou Liu","Xiaoxue Yu","Xijun Wang","Xingyu Shang","Xinquan Wang","Xuelin Cao","Yang Su","Yanping Liang","Yansha Deng","Yifan Yang","Yingping Cui","Yu Sun","Yuxuan Chen","Yvan Pointurier","Zeinab Nehme","Zeinab Nezami","Zhaohui Yang","Zhaoyang Zhang","Zhe Liu","Zhenyu Yang","Zhu Han","Zhuang Zhou","Zihan Chen","Zirui Chen","Zitao Shuai"],"pdf_url":"https://arxiv.org/pdf/2503.04184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04183v1","updated":"2025-03-06T07:52:20Z","published":"2025-03-06T07:52:20Z","title":"CrowdHMTware: A Cross-level Co-adaptation Middleware for Context-aware\n Mobile DL Deployment","summary":" There are many deep learning (DL) powered mobile and wearable applications\ntoday continuously and unobtrusively sensing the ambient surroundings to\nenhance all aspects of human lives.To enable robust and private mobile sensing,\nDL models are often deployed locally on resource-constrained mobile devices\nusing techniques such as model compression or offloading.However, existing\nmethods, either front-end algorithm level (i.e. DL model\ncompression/partitioning) or back-end scheduling level (i.e. operator/resource\nscheduling), cannot be locally online because they require offline retraining\nto ensure accuracy or rely on manually pre-defined strategies, struggle with\ndynamic adaptability.The primary challenge lies in feeding back runtime\nperformance from the back-end level to the front-end level optimization\ndecision. Moreover, the adaptive mobile DL model porting middleware with\ncross-level co-adaptation is less explored, particularly in mobile environments\nwith diversity and dynamics. In response, we introduce CrowdHMTware, a dynamic\ncontext-adaptive DL model deployment middleware for heterogeneous mobile\ndevices. It establishes an automated adaptation loop between cross-level\nfunctional components, i.e. 
elastic inference, scalable offloading, and\nmodel-adaptive engine, enhancing scalability and adaptability. Experiments with\nfour typical tasks across 15 platforms and a real-world case study demonstrate\nthat CrowdHMTware can effectively scale DL model, offloading, and engine\nactions across diverse platforms and tasks. It hides run-time system issues\nfrom developers, reducing the required developer expertise.\n","authors":["Sicong Liu","Bin Guo","Shiyan Luo","Yuzhan Wang","Hao Luo","Cheng Fang","Yuan Xu","Ke Ma","Yao Li","Zhiwen Yu"],"pdf_url":"https://arxiv.org/pdf/2503.04183v1.pdf","comment":"This paper is accepted by IEEE Transactions on Mobile Computing"},{"id":"http://arxiv.org/abs/2411.11006v2","updated":"2025-03-06T07:50:21Z","published":"2024-11-17T09:01:55Z","title":"BackdoorMBTI: A Backdoor Learning Multimodal Benchmark Tool Kit for\n Backdoor Defense Evaluation","summary":" Over the past few years, the emergence of backdoor attacks has presented\nsignificant challenges to deep learning systems, allowing attackers to insert\nbackdoors into neural networks. When data with a trigger is processed by a\nbackdoor model, it can lead to mispredictions targeted by attackers, whereas\nnormal data yields regular results. The scope of backdoor attacks is expanding\nbeyond computer vision and encroaching into areas such as natural language\nprocessing and speech recognition. Nevertheless, existing backdoor defense\nmethods are typically tailored to specific data modalities, restricting their\napplication in multimodal contexts. While multimodal learning proves highly\napplicable in facial recognition, sentiment analysis, action recognition,\nvisual question answering, the security of these models remains a crucial\nconcern. 
Specifically, there are no existing backdoor benchmarks targeting\nmultimodal applications or related tasks.\n In order to facilitate the research in multimodal backdoor, we introduce\nBackdoorMBTI, the first backdoor learning toolkit and benchmark designed for\nmultimodal evaluation across three representative modalities from eleven\ncommonly used datasets. BackdoorMBTI provides a systematic backdoor learning\npipeline, encompassing data processing, data poisoning, backdoor training, and\nevaluation. The generated poison datasets and backdoor models enable detailed\nevaluation of backdoor defenses. Given the diversity of modalities,\nBackdoorMBTI facilitates systematic evaluation across different data types.\nFurthermore, BackdoorMBTI offers a standardized approach to handling practical\nfactors in backdoor learning, such as issues related to data quality and\nerroneous labels. We anticipate that BackdoorMBTI will expedite future research\nin backdoor defense methods within a multimodal context. Code is available at\nhttps://github.com/SJTUHaiyangYu/BackdoorMBTI.\n","authors":["Haiyang Yu","Tian Xie","Jiaping Gui","Pengyang Wang","Ping Yi","Yue Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04176v1","updated":"2025-03-06T07:44:17Z","published":"2025-03-06T07:44:17Z","title":"TIMER: Temporal Instruction Modeling and Evaluation for Longitudinal\n Clinical Records","summary":" Large language models (LLMs) have emerged as promising tools for assisting in\nmedical tasks, yet processing Electronic Health Records (EHRs) presents unique\nchallenges due to their longitudinal nature. While LLMs' capabilities to\nperform medical tasks continue to improve, their ability to reason over\ntemporal dependencies across multiple patient visits and time frames remains\nunexplored. 
We introduce TIMER (Temporal Instruction Modeling and Evaluation\nfor Longitudinal Clinical Records), a framework that incorporate\ninstruction-response pairs grounding to different parts of a patient's record\nas a critical dimension in both instruction evaluation and tuning for\nlongitudinal clinical records. We develop TIMER-Bench, the first time-aware\nbenchmark that evaluates temporal reasoning capabilities over longitudinal\nEHRs, as well as TIMER-Instruct, an instruction-tuning methodology for LLMs to\nlearn reasoning over time. We demonstrate that models fine-tuned with\nTIMER-Instruct improve performance by 7.3% on human-generated benchmarks and\n9.2% on TIMER-Bench, indicating that temporal instruction-tuning improves model\nperformance for reasoning over EHR.\n","authors":["Hejie Cui","Alyssa Unell","Bowen Chen","Jason Alan Fries","Emily Alsentzer","Sanmi Koyejo","Nigam Shah"],"pdf_url":"https://arxiv.org/pdf/2503.04176v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.06665v3","updated":"2025-03-06T07:41:25Z","published":"2024-10-09T08:19:31Z","title":"Revisiting Multi-Permutation Equivariance through the Lens of\n Irreducible Representations","summary":" This paper explores the characterization of equivariant linear layers for\nrepresentations of permutations and related groups. Unlike traditional\napproaches, which address these problems using parameter-sharing, we consider\nan alternative methodology based on irreducible representations and Schur's\nlemma. Using this methodology, we obtain an alternative derivation for existing\nmodels like DeepSets, 2-IGN graph equivariant networks, and Deep Weight Space\n(DWS) networks. The derivation for DWS networks is significantly simpler than\nthat of previous results.\n Next, we extend our approach to unaligned symmetric sets, where equivariance\nto the wreath product of groups is required. 
Previous works have addressed this\nproblem in a rather restrictive setting, in which almost all wreath equivariant\nlayers are Siamese. In contrast, we give a full characterization of layers in\nthis case and show that there is a vast number of additional non-Siamese layers\nin some settings. We also show empirically that these additional non-Siamese\nlayers can improve performance in tasks like graph anomaly detection, weight\nspace alignment, and learning Wasserstein distances. Our code is available at\n\\href{https://github.com/yonatansverdlov/Irreducible-Representations-of-Deep-Weight-Spaces}{GitHub}.\n","authors":["Yonatan Sverdlov","Ido Springer","Nadav Dym"],"pdf_url":"https://arxiv.org/pdf/2410.06665v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04170v1","updated":"2025-03-06T07:36:06Z","published":"2025-03-06T07:36:06Z","title":"Towards Intelligent Transportation with Pedestrians and Vehicles\n In-the-Loop: A Surveillance Video-Assisted Federated Digital Twin Framework","summary":" In intelligent transportation systems (ITSs), incorporating pedestrians and\nvehicles in-the-loop is crucial for developing realistic and safe traffic\nmanagement solutions. However, there is falls short of simulating complex\nreal-world ITS scenarios, primarily due to the lack of a digital twin\nimplementation framework for characterizing interactions between pedestrians\nand vehicles at different locations in different traffic environments. In this\narticle, we propose a surveillance video assisted federated digital twin\n(SV-FDT) framework to empower ITSs with pedestrians and vehicles in-the-loop.\nSpecifically, SVFDT builds comprehensive pedestrian-vehicle interaction models\nby leveraging multi-source traffic surveillance videos. 
Its architecture\nconsists of three layers: (i) the end layer, which collects traffic\nsurveillance videos from multiple sources; (ii) the edge layer, responsible for\nsemantic segmentation-based visual understanding, twin agent-based interaction\nmodeling, and local digital twin system (LDTS) creation in local regions; and\n(iii) the cloud layer, which integrates LDTSs across different regions to\nconstruct a global DT model in realtime. We analyze key design requirements and\nchallenges and present core guidelines for SVFDT's system implementation. A\ntestbed evaluation demonstrates its effectiveness in optimizing traffic\nmanagement. Comparisons with traditional terminal-server frameworks highlight\nSV-FDT's advantages in mirroring delays, recognition accuracy, and subjective\nevaluation. Finally, we identify some open challenges and discuss future\nresearch directions.\n","authors":["Xiaolong Li","Jianhao Wei","Haidong Wang","Li Dong","Ruoyang Chen","Changyan Yi","Jun Cai","Dusit Niyato"," Xuemin"," Shen"],"pdf_url":"https://arxiv.org/pdf/2503.04170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02318v3","updated":"2025-03-06T07:29:44Z","published":"2024-04-18T00:20:48Z","title":"Autoformalizing Natural Language to First-Order Logic: A Case Study in\n Logical Fallacy Detection","summary":" Translating natural language into formal language such as First-Order Logic\n(FOL) is a foundational challenge in NLP with wide-ranging applications in\nautomated reasoning, misinformation tracking, and knowledge validation. In this\npaper, we introduce Natural Language to First-Order Logic (NL2FOL), a framework\nto autoformalize natural language to FOL step by step using Large Language\nModels (LLMs). Our approach addresses key challenges in this translation\nprocess, including the integration of implicit background knowledge. 
By\nleveraging structured representations generated by NL2FOL, we use\nSatisfiability Modulo Theory (SMT) solvers to reason about the logical validity\nof natural language statements. We present logical fallacy detection as a case\nstudy to evaluate the efficacy of NL2FOL. Being neurosymbolic, our approach\nalso provides interpretable insights into the reasoning process and\ndemonstrates robustness without requiring model fine-tuning or labeled training\ndata. Our framework achieves strong performance on multiple datasets. On the\nLOGIC dataset, NL2FOL achieves an F1-score of 78%, while generalizing\neffectively to the LOGICCLIMATE dataset with an F1-score of 80%.\n","authors":["Abhinav Lalwani","Tasha Kim","Lovish Chopra","Christopher Hahn","Zhijing Jin","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2405.02318v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04167v1","updated":"2025-03-06T07:29:33Z","published":"2025-03-06T07:29:33Z","title":"The Role of Visual Modality in Multimodal Mathematical Reasoning:\n Challenges and Insights","summary":" Recent research has increasingly focused on multimodal mathematical\nreasoning, particularly emphasizing the creation of relevant datasets and\nbenchmarks. Despite this, the role of visual information in reasoning has been\nunderexplored. Our findings show that existing multimodal mathematical models\nminimally leverage visual information, and model performance remains largely\nunaffected by changes to or removal of images in the dataset. We attribute this\nto the dominance of textual information and answer options that inadvertently\nguide the model to correct answers. To improve evaluation methods, we introduce\nthe HC-M3D dataset, specifically designed to require image reliance for\nproblem-solving and to challenge models with similar, yet distinct, images that\nchange the correct answer. 
In testing leading models, their failure to detect\nthese subtle visual differences suggests limitations in current visual\nperception capabilities. Additionally, we observe that the common approach of\nimproving general VQA capabilities by combining various types of image encoders\ndoes not contribute to math reasoning performance. This finding also presents a\nchallenge to enhancing visual reliance during math reasoning. Our benchmark and\ncode would be available at\n\\href{https://github.com/Yufang-Liu/visual_modality_role}{https://github.com/Yufang-Liu/visual\\_modality\\_role}.\n","authors":["Yufang Liu","Yao Du","Tao Ji","Jianing Wang","Yang Liu","Yuanbin Wu","Aimin Zhou","Mengdi Zhang","Xunliang Cai"],"pdf_url":"https://arxiv.org/pdf/2503.04167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04162v1","updated":"2025-03-06T07:25:19Z","published":"2025-03-06T07:25:19Z","title":"Semantic Retrieval Augmented Contrastive Learning for Sequential\n Recommendation","summary":" Sequential recommendation aims to model user preferences based on historical\nbehavior sequences, which is crucial for various online platforms. Data\nsparsity remains a significant challenge in this area as most users have\nlimited interactions and many items receive little attention. To mitigate this\nissue, contrastive learning has been widely adopted. By constructing positive\nsample pairs from the data itself and maximizing their agreement in the\nembedding space,it can leverage available data more effectively. Constructing\nreasonable positive sample pairs is crucial for the success of contrastive\nlearning. However, current approaches struggle to generate reliable positive\npairs as they either rely on representations learned from inherently sparse\ncollaborative signals or use random perturbations which introduce significant\nuncertainty. 
To address these limitations, we propose a novel approach named\nSemantic Retrieval Augmented Contrastive Learning (SRA-CL), which leverages\nsemantic information to improve the reliability of contrastive samples. SRA-CL\ncomprises two main components: (1) Cross-Sequence Contrastive Learning via User\nSemantic Retrieval, which utilizes large language models (LLMs) to understand\ndiverse user preferences and retrieve semantically similar users to form\nreliable positive samples through a learnable sample synthesis method; and (2)\nIntra-Sequence Contrastive Learning via Item Semantic Retrieval, which employs\nLLMs to comprehend items and retrieve similar items to perform semantic-based\nitem substitution, thereby creating semantically consistent augmented views for\ncontrastive learning. SRA-CL is plug-and-play and can be integrated into\nstandard sequential recommendation models. Extensive experiments on four public\ndatasets demonstrate the effectiveness and generalizability of the proposed\napproach.\n","authors":["Ziqiang Cui","Yunpeng Weng","Xing Tang","Xiaokun Zhang","Dugang Liu","Shiwei Li","Peiyang Liu","Bowei He","Weihong Luo","Xiuqiang He","Chen Ma"],"pdf_url":"https://arxiv.org/pdf/2503.04162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04160v1","updated":"2025-03-06T07:23:44Z","published":"2025-03-06T07:23:44Z","title":"Unseen Fake News Detection Through Casual Debiasing","summary":" The widespread dissemination of fake news on social media poses significant\nrisks, necessitating timely and accurate detection. However, existing methods\nstruggle with unseen news due to their reliance on training data from past\nevents and domains, leaving the challenge of detecting novel fake news largely\nunresolved. To address this, we identify biases in training data tied to\nspecific domains and propose a debiasing solution FNDCD. 
Originating from\ncausal analysis, FNDCD employs a reweighting strategy based on classification\nconfidence and propagation structure regularization to reduce the influence of\ndomain-specific biases, enhancing the detection of unseen fake news.\nExperiments on real-world datasets with non-overlapping news domains\ndemonstrate FNDCD's effectiveness in improving generalization across domains.\n","authors":["Shuzhi Gong","Richard Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2503.04160v1.pdf","comment":"2025 The Web Conference, 6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2502.13681v2","updated":"2025-03-06T07:17:09Z","published":"2025-02-19T12:51:35Z","title":"An LLM-based Agent for Reliable Docker Environment Configuration","summary":" Environment configuration is a critical yet time-consuming step in software\ndevelopment, especially when dealing with unfamiliar code repositories. While\nLarge Language Models (LLMs) demonstrate the potential to accomplish software\nengineering tasks, existing methods for environment configuration often rely on\nmanual efforts or fragile scripts, leading to inefficiencies and unreliable\noutcomes. We introduce Repo2Run, the first LLM-based agent designed to fully\nautomate environment configuration and generate executable Dockerfiles for\narbitrary Python repositories. We address two major challenges: (1) enabling\nthe LLM agent to configure environments within isolated Docker containers, and\n(2) ensuring the successful configuration process is recorded and accurately\ntransferred to a Dockerfile without error. To achieve this, we propose atomic\nconfiguration synthesis, featuring a dual-environment architecture (internal\nand external environment) with a rollback mechanism to prevent environment\n\"pollution\" from failed commands, guaranteeing atomic execution (execute fully\nor not at all) and a Dockerfile generator to transfer successful configuration\nsteps into runnable Dockerfiles. 
We evaluate Repo2Run~on our proposed benchmark\nof 420 recent Python repositories with unit tests, where it achieves an 86.0%\nsuccess rate, outperforming the best baseline by 63.9%. Repo2Run is available\nat https://github.com/bytedance/Repo2Run.\n","authors":["Ruida Hu","Chao Peng","Xinchen Wang","Cuiyun Gao"],"pdf_url":"https://arxiv.org/pdf/2502.13681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04154v1","updated":"2025-03-06T07:02:13Z","published":"2025-03-06T07:02:13Z","title":"CA-W3D: Leveraging Context-Aware Knowledge for Weakly Supervised\n Monocular 3D Detection","summary":" Weakly supervised monocular 3D detection, while less annotation-intensive,\noften struggles to capture the global context required for reliable 3D\nreasoning. Conventional label-efficient methods focus on object-centric\nfeatures, neglecting contextual semantic relationships that are critical in\ncomplex scenes. In this work, we propose a Context-Aware Weak Supervision for\nMonocular 3D object detection, namely CA-W3D, to address this limitation in a\ntwo-stage training paradigm. Specifically, we first introduce a pre-training\nstage employing Region-wise Object Contrastive Matching (ROCM), which aligns\nregional object embeddings derived from a trainable monocular 3D encoder and a\nfrozen open-vocabulary 2D visual grounding model. This alignment encourages the\nmonocular encoder to discriminate scene-specific attributes and acquire richer\ncontextual knowledge. In the second stage, we incorporate a pseudo-label\ntraining process with a Dual-to-One Distillation (D2OD) mechanism, which\neffectively transfers contextual priors into the monocular encoder while\npreserving spatial fidelity and maintaining computational efficiency during\ninference. 
Extensive experiments conducted on the public KITTI benchmark\ndemonstrate the effectiveness of our approach, surpassing the SoTA method over\nall metrics, highlighting the importance of contextual-aware knowledge in\nweakly-supervised monocular 3D detection.\n","authors":["Chupeng Liu","Runkai Zhao","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2503.04154v1.pdf","comment":"The paper includes 8 pages, 6 figures and 4 tables"},{"id":"http://arxiv.org/abs/2503.04153v1","updated":"2025-03-06T07:01:36Z","published":"2025-03-06T07:01:36Z","title":"KidneyTalk-open: No-code Deployment of a Private Large Language Model\n with Medical Documentation-Enhanced Knowledge Database for Kidney Disease","summary":" Privacy-preserving medical decision support for kidney disease requires\nlocalized deployment of large language models (LLMs) while maintaining clinical\nreasoning capabilities. Current solutions face three challenges: 1) Cloud-based\nLLMs pose data security risks; 2) Local model deployment demands technical\nexpertise; 3) General LLMs lack mechanisms to integrate medical knowledge.\nRetrieval-augmented systems also struggle with medical document processing and\nclinical usability. We developed KidneyTalk-open, a desktop system integrating\nthree technical components: 1) No-code deployment of state-of-the-art (SOTA)\nopen-source LLMs (such as DeepSeek-r1, Qwen2.5) via local inference engine; 2)\nMedical document processing pipeline combining context-aware chunking and\nintelligent filtering; 3) Adaptive Retrieval and Augmentation Pipeline (AddRep)\nemploying agents collaboration for improving the recall rate of medical\ndocuments. A graphical interface was designed to enable clinicians to manage\nmedical documents and conduct AI-powered consultations without technical\nexpertise. 
Experimental validation on 1,455 challenging nephrology exam\nquestions demonstrates AddRep's effectiveness: achieving 29.1% accuracy (+8.1%\nover baseline) with intelligent knowledge integration, while maintaining\nrobustness through 4.9% rejection rate to suppress hallucinations. Comparative\ncase studies with the mainstream products (AnythingLLM, Chatbox, GPT4ALL)\ndemonstrate KidneyTalk-open's superior performance in real clinical query.\nKidneyTalk-open represents the first no-code medical LLM system enabling secure\ndocumentation-enhanced medical Q&A on desktop. Its designs establishes a new\nframework for privacy-sensitive clinical AI applications. The system\nsignificantly lowers technical barriers while improving evidence traceability,\nenabling more medical staff or patients to use SOTA open-source LLMs\nconveniently.\n","authors":["Yongchao Long","Chao Yang","Gongzheng Tang","Jinwei Wang","Zhun Sui","Yuxi Zhou","Shenda Hong","Luxia Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04153v1.pdf","comment":"Corresponding authors: zhanglx@bjmu.edu.cn; joy_yuxi@pku.edu.cn;\n hongshenda@pku.edu.cn"},{"id":"http://arxiv.org/abs/2503.04151v1","updated":"2025-03-06T07:01:08Z","published":"2025-03-06T07:01:08Z","title":"Robust Multi-View Learning via Representation Fusion of Sample-Level\n Attention and Alignment of Simulated Perturbation","summary":" Recently, multi-view learning (MVL) has garnered significant attention due to\nits ability to fuse discriminative information from multiple views. However,\nreal-world multi-view datasets are often heterogeneous and imperfect, which\nusually makes MVL methods designed for specific combinations of views lack\napplication potential and limits their effectiveness. To address this issue, we\npropose a novel robust MVL method (namely RML) with simultaneous representation\nfusion and alignment. 
Specifically, we introduce a simple yet effective\nmulti-view transformer fusion network where we transform heterogeneous\nmulti-view data into homogeneous word embeddings, and then integrate multiple\nviews by the sample-level attention mechanism to obtain a fused representation.\nFurthermore, we propose a simulated perturbation based multi-view contrastive\nlearning framework that dynamically generates the noise and unusable\nperturbations for simulating imperfect data conditions. The simulated noisy and\nunusable data obtain two distinct fused representations, and we utilize\ncontrastive learning to align them for learning discriminative and robust\nrepresentations. Our RML is self-supervised and can also be applied for\ndownstream tasks as a regularization. In experiments, we employ it in\nunsupervised multi-view clustering, noise-label classification, and as a\nplug-and-play module for cross-modal hashing retrieval. Extensive comparison\nexperiments and ablation studies validate the effectiveness of RML.\n","authors":["Jie Xu","Na Zhao","Gang Niu","Masashi Sugiyama","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2503.04151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04150v1","updated":"2025-03-06T06:59:09Z","published":"2025-03-06T06:59:09Z","title":"Ticktack : Long Span Temporal Alignment of Large Language Models\n Leveraging Sexagenary Cycle Time Expression","summary":" Large language models (LLMs) suffer from temporal misalignment issues\nespecially across long span of time. The issue arises from knowing that LLMs\nare trained on large amounts of data where temporal information is rather\nsparse over long times, such as thousands of years, resulting in insufficient\nlearning or catastrophic forgetting by the LLMs. This paper proposes a\nmethodology named \"Ticktack\" for addressing the LLM's long-time span\nmisalignment in a yearly setting. 
Specifically, we first propose to utilize the\nsexagenary year expression instead of the Gregorian year expression employed by\nLLMs, achieving a more uniform distribution in yearly granularity. Then, we\nemploy polar coordinates to model the sexagenary cycle of 60 terms and the year\norder within each term, with additional temporal encoding to ensure LLMs\nunderstand them. Finally, we present a temporal representational alignment\napproach for post-training LLMs that effectively distinguishes time points with\nrelevant knowledge, hence improving performance on time-related tasks,\nparticularly over a long period. We also create a long time span benchmark for\nevaluation. Experimental results prove the effectiveness of our proposal.\n","authors":["Xue Han","Qian Hu","Yitong Wang","Wenchun Gao","Lianlian Zhang","Qing Wang","Lijun Mei","Chao Deng","Junlan Feng"],"pdf_url":"https://arxiv.org/pdf/2503.04150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04149v1","updated":"2025-03-06T06:56:59Z","published":"2025-03-06T06:56:59Z","title":"Dynamic Benchmarking of Reasoning Capabilities in Code Large Language\n Models Under Data Contamination","summary":" The rapid evolution of code largelanguage models underscores the need for\neffective and transparent benchmarking of their reasoning capabilities.\nHowever, the current benchmarking approach heavily depends on publicly\navailable, human-created datasets. The widespread use of these fixed benchmark\ndatasets makes the benchmarking process to be static and thus particularly\nsusceptible to data contamination, an unavoidable consequence of the extensive\ndata collection processes used to train Code LLMs. Existing approaches that\naddress data contamination often suffer from human effort limitations and\nimbalanced problem complexity. To tackle these challenges, we propose \\tool, a\nnovel benchmarking suite for evaluating Code LLMs under potential data\ncontamination. 
Given a seed programming problem, \\tool employs multiple agents\nto extract and modify the context without altering the core logic, generating\nsemantically equivalent variations. We introduce a dynamic data generation\nmethods and conduct empirical studies on two seed datasets across 21 Code LLMs.\nResults show that \\tool effectively benchmarks reasoning capabilities under\ncontamination risks while generating diverse problem sets to ensure consistent\nand reliable evaluations.\n","authors":["Simin Chen","Pranav Pusarla","Baishakhi Ray"],"pdf_url":"https://arxiv.org/pdf/2503.04149v1.pdf","comment":"https://codekaleidoscope.github.io/dycodeeval.html"},{"id":"http://arxiv.org/abs/2502.12767v2","updated":"2025-03-06T06:41:40Z","published":"2025-02-18T11:31:52Z","title":"R2-KG: General-Purpose Dual-Agent Framework for Reliable Reasoning on\n Knowledge Graphs","summary":" Recent studies have combined Large Language Models (LLMs) with Knowledge\nGraphs (KGs) to enhance reasoning, improving inference accuracy without\nadditional training while mitigating hallucination. However, existing\nframeworks are often rigid, struggling to adapt to KG or task changes. They\nalso rely heavily on powerful LLMs for reliable (i.e., trustworthy) reasoning.\nTo address this, We introduce R2-KG, a plug-and-play, dual-agent framework that\nseparates reasoning into two roles: an Operator (a low-capacity LLM) that\ngathers evidence and a Supervisor (a high-capacity LLM) that makes final\njudgments. This design is cost-efficient for LLM inference while still\nmaintaining strong reasoning accuracy. Additionally, R2-KG employs an\nAbstention mechanism, generating answers only when sufficient evidence is\ncollected from KG, which significantly enhances reliability. Experiments across\nmultiple KG-based reasoning tasks show that R2-KG consistently outperforms\nbaselines in both accuracy and reliability, regardless of the inherent\ncapability of LLMs used as the Operator. 
Further experiments reveal that the\nsingle-agent version of R2-KG, equipped with a strict self-consistency\nstrategy, achieves significantly higher-than-baseline reliability while\nreducing inference cost. However, it also leads to a higher abstention rate in\ncomplex KGs. Our findings establish R2-KG as a flexible and cost-effective\nsolution for KG-based reasoning. It reduces reliance on high-capacity LLMs\nwhile ensuring trustworthy inference.\n","authors":["Sumin Jo","Junseong Choi","Jiho Kim","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2502.12767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04144v1","updated":"2025-03-06T06:41:38Z","published":"2025-03-06T06:41:38Z","title":"DM-Adapter: Domain-Aware Mixture-of-Adapters for Text-Based Person\n Retrieval","summary":" Text-based person retrieval (TPR) has gained significant attention as a\nfine-grained and challenging task that closely aligns with practical\napplications. Tailoring CLIP to person domain is now a emerging research topic\ndue to the abundant knowledge of vision-language pretraining, but challenges\nstill remain during fine-tuning: (i) Previous full-model fine-tuning in TPR is\ncomputationally expensive and prone to overfitting.(ii) Existing\nparameter-efficient transfer learning (PETL) for TPR lacks of fine-grained\nfeature extraction. To address these issues, we propose Domain-Aware\nMixture-of-Adapters (DM-Adapter), which unifies Mixture-of-Experts (MOE) and\nPETL to enhance fine-grained feature representations while maintaining\nefficiency. Specifically, Sparse Mixture-of-Adapters is designed in parallel to\nMLP layers in both vision and language branches, where different experts\nspecialize in distinct aspects of person knowledge to handle features more\nfinely. 
To promote the router to exploit domain information effectively and\nalleviate the routing imbalance, Domain-Aware Router is then developed by\nbuilding a novel gating function and injecting learnable domain-aware prompts.\nExtensive experiments show that our DM-Adapter achieves state-of-the-art\nperformance, outperforming previous methods by a significant margin.\n","authors":["Yating Liu","Zimo Liu","Xiangyuan Lan","Wenming Yang","Yaowei Li","Qingmin Liao"],"pdf_url":"https://arxiv.org/pdf/2503.04144v1.pdf","comment":"9 pages, 5 figures, accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2503.04143v1","updated":"2025-03-06T06:41:17Z","published":"2025-03-06T06:41:17Z","title":"MTS: A Deep Reinforcement Learning Portfolio Management Framework with\n Time-Awareness and Short-Selling","summary":" Portfolio management remains a crucial challenge in finance, with traditional\nmethods often falling short in complex and volatile market environments. While\ndeep reinforcement approaches have shown promise, they still face limitations\nin dynamic risk management, exploitation of temporal markets, and incorporation\nof complex trading strategies such as short-selling. These limitations can lead\nto suboptimal portfolio performance, increased vulnerability to market\nvolatility, and missed opportunities in capturing potential returns from\ndiverse market conditions. This paper introduces a Deep Reinforcement Learning\nPortfolio Management Framework with Time-Awareness and Short-Selling (MTS),\noffering a robust and adaptive strategy for sustainable investment performance.\nThis framework utilizes a novel encoder-attention mechanism to address the\nlimitations by incorporating temporal market characteristics, a parallel\nstrategy for automated short-selling based on market trends, and risk\nmanagement through innovative Incremental Conditional Value at Risk, enhancing\nadaptability and performance. 
Experimental validation on five diverse datasets\nfrom 2019 to 2023 demonstrates MTS's superiority over traditional algorithms\nand advanced machine learning techniques. MTS consistently achieves higher\ncumulative returns, Sharpe, Omega, and Sortino ratios, underscoring its\neffectiveness in balancing risk and return while adapting to market dynamics.\nMTS demonstrates an average relative increase of 30.67% in cumulative returns\nand 29.33% in Sharpe ratio compared to the next best-performing strategies\nacross various datasets.\n","authors":["Fengchen Gu","Zhengyong Jiang","Ángel F. García-Fernández","Angelos Stefanidis","Jionglong Su","Huakang Li"],"pdf_url":"https://arxiv.org/pdf/2503.04143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17635v2","updated":"2025-03-06T06:39:56Z","published":"2024-10-23T07:53:29Z","title":"Markov Chain of Thought for Efficient Mathematical Reasoning","summary":" Chain of Thought (CoT) of multi-step benefits from the logical structure of\nthe reasoning steps and task-specific actions, significantly enhancing the\nmathematical reasoning capabilities of large language models. As the prevalence\nof long CoT, the number of reasoning steps exceeds manageable token limits and\nleads to higher computational demands. Inspired by the fundamental logic of\nhuman cognition, \"derive, then reduce\", we conceptualize the standard\nmulti-step CoT as a novel Markov Chain of Thought (MCoT). In this study, we\nconsider the mathematical reasoning task, defining each reasoning step as text\naccompanied by a Python code snippet. To facilitate a longer reasoning path,\nself-correction is enabled through interactions with the code interpreter. 
Our\nMCoT aims to compress previous reasoning steps into a simplified question,\nenabling efficient next-step inference without relying on a lengthy KV cache.\nIn our experiments, we curate the $\\texttt{MCoTInstruct}$ dataset, and the\nempirical results indicate that MCoT not only significantly enhances efficiency\nbut also maintains comparable accuracy. While much remains to be explored, this\nwork paves the way for exploring the long CoT reasoning abilities of LLMs. The\ncode is available at https://github.com/james-yw/Markov-Chain-of-Thought\n","authors":["Wen Yang","Minpeng Liao","Kai Fan"],"pdf_url":"https://arxiv.org/pdf/2410.17635v2.pdf","comment":"Camera ready version for NAACL 2025 Main"},{"id":"http://arxiv.org/abs/2502.14074v2","updated":"2025-03-06T06:32:54Z","published":"2025-02-19T19:59:16Z","title":"Investigating Non-Transitivity in LLM-as-a-Judge","summary":" Automatic evaluation methods based on large language models (LLMs) are\nemerging as the standard tool for assessing the instruction-following abilities\nof LLM-based agents. The most common method in this paradigm, pairwise\ncomparisons with a baseline model, critically depends on the assumption of\ntransitive preferences. However, the validity of this assumption remains\nlargely unexplored. In this study, we investigate the presence of\nnon-transitivity within the AlpacaEval framework and analyze its effects on\nmodel rankings. We find that LLM judges exhibit non-transitive preferences,\nleading to rankings that are sensitive to the choice of the baseline model. To\nmitigate this issue, we show that round-robin tournaments combined with\nBradley-Terry models of preference can produce more reliable rankings. Notably,\nour method increases both the Spearman correlation and the Kendall correlation\nwith Chatbot Arena (95.0% -> 96.4% and 82.1% -> 86.3% respectively). 
To address\nthe computational cost of round-robin tournaments, we propose Swiss-Wise\nIterative Matchmaking (Swim) tournaments, using a dynamic matching strategy to\ncapture the benefits of round-robin tournaments while maintaining computational\nefficiency.\n","authors":["Yi Xu","Laura Ruis","Tim Rocktäschel","Robert Kirk"],"pdf_url":"https://arxiv.org/pdf/2502.14074v2.pdf","comment":"8 pages, 6 figures, 2 tables (30 pages, 11 figures, 8 tables\n including references and appendices)"},{"id":"http://arxiv.org/abs/2503.04128v1","updated":"2025-03-06T06:14:27Z","published":"2025-03-06T06:14:27Z","title":"Artificial Intelligence in Pronunciation Teaching: Use and Beliefs of\n Foreign Language Teachers","summary":" Pronunciation instruction in foreign language classrooms has often been an\noverlooked area of focus. With the widespread adoption of Artificial\nIntelligence (AI) and its potential benefits, investigating how AI is utilized\nin pronunciation teaching and understanding the beliefs of teachers about this\ntool is essential for improving learning outcomes. This study aims to examine\nhow AI use for pronunciation instruction varies across different demographic\nand professional factors among teachers, and how these factors, including AI\nuse, influence the beliefs of teachers about AI. The study involved 117 English\nas a Foreign Language (EFL) in-service teachers working in Cyprus, who\ncompleted an online survey designed to assess their beliefs about the\neffectiveness of AI, its drawbacks, and their willingness to integrate AI into\ntheir teaching practices. The results revealed that teachers were significantly\nmore likely to agree on the perceived effectiveness of AI and their willingness\nto adopt it, compared to their concerns about its use. Furthermore, teachers\nworking in higher education and adult education, as well as those who had\nreceived more extensive training, reported using AI more frequently in their\nteaching. 
Teachers who utilized AI more often expressed stronger agreement with\nits effectiveness, while those who had received more training were less likely\nto express concerns about its integration. Given the limited training that many\nteachers currently receive, these findings demonstrate the need for tailored\ntraining sessions that address the specific needs and concerns of educators,\nultimately fostering the adoption of AI in pronunciation instruction.\n","authors":["Georgios P. Georgiou"],"pdf_url":"https://arxiv.org/pdf/2503.04128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18125v3","updated":"2025-03-06T06:10:12Z","published":"2024-10-16T07:45:31Z","title":"Towards Edge General Intelligence via Large Language Models:\n Opportunities and Challenges","summary":" Edge Intelligence (EI) has been instrumental in delivering real-time,\nlocalized services by leveraging the computational capabilities of edge\nnetworks. The integration of Large Language Models (LLMs) empowers EI to evolve\ninto the next stage: Edge General Intelligence (EGI), enabling more adaptive\nand versatile applications that require advanced understanding and reasoning\ncapabilities. However, systematic exploration in this area remains\ninsufficient. This survey delineates the distinctions between EGI and\ntraditional EI, categorizing LLM-empowered EGI into three conceptual systems:\ncentralized, hybrid, and decentralized. For each system, we detail the\nframework designs and review existing implementations. Furthermore, we evaluate\nthe performance and throughput of various Small Language Models (SLMs) that are\nmore suitable for development on edge devices. This survey provides researchers\nwith a comprehensive vision of EGI, offering insights into its vast potential\nand establishing a foundation for future advancements in this rapidly evolving\nfield.\n","authors":["Handi Chen","Weipeng Deng","Shuo Yang","Jinfeng Xu","Zhihan Jiang","Edith C. H. 
Ngai","Jiangchuan Liu","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2410.18125v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04121v1","updated":"2025-03-06T05:58:41Z","published":"2025-03-06T05:58:41Z","title":"Simple Self Organizing Map with Visual Transformer","summary":" Vision Transformers (ViTs) have demonstrated exceptional performance in\nvarious vision tasks. However, they tend to underperform on smaller datasets\ndue to their inherent lack of inductive biases. Current approaches address this\nlimitation implicitly-often by pairing ViTs with pretext tasks or by distilling\nknowledge from convolutional neural networks (CNNs) to strengthen the prior. In\ncontrast, Self-Organizing Maps (SOMs), a widely adopted self-supervised\nframework, are inherently structured to preserve topology and spatial\norganization, making them a promising candidate to directly address the\nlimitations of ViTs in limited or small training datasets. Despite this\npotential, equipping SOMs with modern deep learning architectures remains\nlargely unexplored. In this study, we conduct a novel exploration on how Vision\nTransformers (ViTs) and Self-Organizing Maps (SOMs) can empower each other,\naiming to bridge this critical research gap. Our findings demonstrate that\nthese architectures can synergistically enhance each other, leading to\nsignificantly improved performance in both unsupervised and supervised tasks.\nCode will be publicly available.\n","authors":["Alan Luo","Kaiwen Yuan"],"pdf_url":"https://arxiv.org/pdf/2503.04121v1.pdf","comment":"5 pages, 4 figures. Submitted to IEEE. 
All experiments and code work\n were performed by the first author, with the second author serving in a\n PI/mentor role, guiding the progression of the work"},{"id":"http://arxiv.org/abs/2411.18104v2","updated":"2025-03-06T05:54:29Z","published":"2024-11-27T07:32:56Z","title":"Training and Evaluating Language Models with Template-based Data\n Generation","summary":" The rapid advancement of large language models (LLMs) such as GPT-3, PaLM,\nand Llama has significantly transformed natural language processing, showcasing\nremarkable capabilities in understanding and generating language. However,\nthese models often struggle with tasks requiring complex reasoning,\nparticularly in mathematical problem-solving, due in part to the scarcity of\nlarge-scale, high-quality, domain-specific datasets necessary for training\nsophisticated reasoning abilities. To address this limitation, we introduce\nTemplate-based Data Generation (TDG), a novel approach that leverages LLMs\n(GPT-4) to automatically generate parameterized meta-templates, which are then\nused to synthesize a vast array of high-quality problems and solutions.\nLeveraging TDG, we create TemplateMath Part I: TemplateGSM, a dataset\ncomprising over 7 million synthetically generated grade school math\nproblems--each accompanied by code-based and natural language solutions--with\nthe potential to generate an effectively unlimited number more. This dataset\nalleviates the scarcity of large-scale mathematical datasets and serves as a\nvaluable resource for pre-training, fine-tuning, and evaluating LLMs in\nmathematical reasoning. Our method not only enables the generation of virtually\ninfinite data but also elevates data augmentation to a new level by using GPT-4\nfor meta-template generation, ensuring diverse and high-quality problem\nstructures. The TemplateMath Part I: TemplateGSM dataset is publicly available\nat https://huggingface.co/datasets/math-ai/TemplateGSM. 
The code is available\nat https://github.com/iiis-ai/TemplateMath.\n","authors":["Yifan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.18104v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.07055v2","updated":"2025-03-06T05:48:54Z","published":"2024-09-11T07:01:08Z","title":"Legal Fact Prediction: The Missing Piece in Legal Judgment Prediction","summary":" Legal judgment prediction (LJP), which enables litigants and their lawyers to\nforecast judgment outcomes and refine litigation strategies, has emerged as a\ncrucial legal NLP task. Existing studies typically utilize legal facts, i.e.,\nfacts that have been established by evidence and determined by the judge, to\npredict the judgment. However, legal facts are often difficult to obtain in the\nearly stages of litigation, significantly limiting the practical applicability\nof fact-based LJP. To address this limitation, we propose a novel legal NLP\ntask: \\textit{legal fact prediction} (LFP), which takes the evidence submitted\nby litigants for trial as input to predict legal facts, thereby empowering\nfact-based LJP technologies to perform prediction in the absence of\nground-truth legal facts. We also propose the first benchmark dataset,\nLFPBench, for evaluating the LFP task. Our extensive experiments on LFPBench\ndemonstrate the effectiveness of LFP-empowered LJP and highlight promising\nresearch directions for LFP. 
Our code and data are available at\nhttps://github.com/HPRCEST/LFPBench.\n","authors":["Junkai Liu","Yujie Tong","Hui Huang","Bowen Zheng","Yiran Hu","Peicheng Wu","Chuan Xiao","Makoto Onizuka","Muyun Yang","Shuyuan Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.07055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02398v2","updated":"2025-03-06T05:46:40Z","published":"2024-11-04T18:59:51Z","title":"Prompting with Phonemes: Enhancing LLMs' Multilinguality for Non-Latin\n Script Languages","summary":" Although multilingual LLMs have achieved remarkable performance across\nbenchmarks, we find they continue to underperform on non-Latin script languages\nacross contemporary LLM families. This discrepancy arises from the fact that\nLLMs are pretrained with orthographic scripts, which are dominated by Latin\ncharacters that obscure their shared phonology with non-Latin scripts. We\npropose leveraging phonemic transcriptions as complementary signals to induce\nscript-invariant representations. Our study demonstrates that integrating\nphonemic signals improves performance across both non-Latin and Latin script\nlanguages, with a particularly significant impact on closing the performance\ngap between the two. Through detailed experiments, we show that phonemic and\northographic scripts retrieve distinct examples for in-context learning (ICL).\nThis motivates our proposed Mixed-ICL retrieval strategy, where further\naggregation from both leads to our significant performance improvements for\nboth Latin script languages (up to 12.6%) and non-Latin script languages (up to\n15.1%) compared to randomized ICL retrieval.\n","authors":["Hoang H Nguyen","Khyati Mahajan","Vikas Yadav","Julian Salazar","Philip S. 
Yu","Masoud Hashemi","Rishabh Maheshwary"],"pdf_url":"https://arxiv.org/pdf/2411.02398v2.pdf","comment":"Accepted for NAACL 2025 (Main Conference)"},{"id":"http://arxiv.org/abs/2402.04355v2","updated":"2025-03-06T05:43:48Z","published":"2024-02-06T19:39:26Z","title":"PQMass: Probabilistic Assessment of the Quality of Generative Models\n using Probability Mass Estimation","summary":" We propose a likelihood-free method for comparing two distributions given\nsamples from each, with the goal of assessing the quality of generative models.\nThe proposed approach, PQMass, provides a statistically rigorous method for\nassessing the performance of a single generative model or the comparison of\nmultiple competing models. PQMass divides the sample space into non-overlapping\nregions and applies chi-squared tests to the number of data samples that fall\nwithin each region, giving a p-value that measures the probability that the bin\ncounts derived from two sets of samples are drawn from the same multinomial\ndistribution. PQMass does not depend on assumptions regarding the density of\nthe true distribution, nor does it rely on training or fitting any auxiliary\nmodels. We evaluate PQMass on data of various modalities and dimensions,\ndemonstrating its effectiveness in assessing the quality, novelty, and\ndiversity of generated samples. 
We further show that PQMass scales well to\nmoderately high-dimensional data and thus obviates the need for feature\nextraction in practical applications.\n","authors":["Pablo Lemos","Sammy Sharief","Nikolay Malkin","Salma Salhi","Connor Stone","Laurence Perreault-Levasseur","Yashar Hezaveh"],"pdf_url":"https://arxiv.org/pdf/2402.04355v2.pdf","comment":"Published as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2410.16024v3","updated":"2025-03-06T05:38:52Z","published":"2024-10-21T13:58:38Z","title":"SMAC-R1: The Emergence of Intelligence in Decision-Making Tasks","summary":" StarCraft Multi-Agent Challenge (SMAC) has been one of the most commonly used\nexperimental environments in multi-agent reinforcement learning (MARL), where\nthe specific task is to control a set number of allied units to defeat enemy\nforces. Traditional MARL algorithms often require interacting with the\nenvironment for millions of steps to train a parametric model, of which the\nresulting policies are typically non-interpretable with weak transferability.\nIn this paper, we introduce SMAC-R1 which is based on the Qwen2.5-7B-Base LLM\ndistilled from DeepSeek-Coder-v2.5-236B. Similar to online reinforcement\nlearning after behavior cloning in offline learning process, in our pipeline,\nagents leverage the DeepSeek LLM to generate decision tree code by providing\ntask descriptions, and the agents are further self-reflected using feedback\nfrom the rewards provided by the environment. Based on that, we augment the\ngenerated scripts to fine-tune a small LLM, Qwen2.5-7B-Base, to distill the\ndecision-making ability via Supervised Fine-Tuning (SFT) and enhance the script\ngeneration ability by the Group Relative Policy Optimization (GRPO) algorithm.\nWe conduct experiments in the original 23 SMAC tasks and 10 newly-designed\ntasks to demonstrate that our method can produce high-quality, interpretable\ndecision trees with minimal environmental exploration. 
Moreover, these scripts\nexhibit strong transferability, successfully applying to homogeneous SMAC\nenvironments without modification. We believe this approach offers a new\ndirection for solving decision-making tasks and domain-specific LLM training\npipelines in the future.\n","authors":["Yue Deng","Weiyu Ma","Yuxin Fan","Ruyi Song","Yin Zhang","Haifeng Zhang","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.16024v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04111v1","updated":"2025-03-06T05:36:35Z","published":"2025-03-06T05:36:35Z","title":"Generalizability of Neural Networks Minimizing Empirical Risk Based on\n Expressive Ability","summary":" The primary objective of learning methods is generalization. Classic uniform\ngeneralization bounds, which rely on VC-dimension or Rademacher complexity,\nfail to explain the significant attribute that over-parameterized models in\ndeep learning exhibit nice generalizability. On the other hand,\nalgorithm-dependent generalization bounds, like stability bounds, often rely on\nstrict assumptions. To establish generalizability under less stringent\nassumptions, this paper investigates the generalizability of neural networks\nthat minimize or approximately minimize empirical risk. We establish a lower\nbound for population accuracy based on the expressiveness of these networks,\nwhich indicates that with an adequate large number of training samples and\nnetwork sizes, these networks, including over-parameterized ones, can\ngeneralize effectively. Additionally, we provide a necessary condition for\ngeneralization, demonstrating that, for certain data distributions, the\nquantity of training data required to ensure generalization exceeds the network\nsize needed to represent the corresponding data distribution. 
Finally, we\nprovide theoretical insights into several phenomena in deep learning, including\nrobust generalization, importance of over-parameterization, and effect of loss\nfunction on generalization.\n","authors":["Lijia Yu","Yibo Miao","Yifan Zhu","Xiao-Shan Gao","Lijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04355v2","updated":"2025-03-06T05:43:48Z","published":"2024-02-06T19:39:26Z","title":"PQMass: Probabilistic Assessment of the Quality of Generative Models\n using Probability Mass Estimation","summary":" We propose a likelihood-free method for comparing two distributions given\nsamples from each, with the goal of assessing the quality of generative models.\nThe proposed approach, PQMass, provides a statistically rigorous method for\nassessing the performance of a single generative model or the comparison of\nmultiple competing models. PQMass divides the sample space into non-overlapping\nregions and applies chi-squared tests to the number of data samples that fall\nwithin each region, giving a p-value that measures the probability that the bin\ncounts derived from two sets of samples are drawn from the same multinomial\ndistribution. PQMass does not depend on assumptions regarding the density of\nthe true distribution, nor does it rely on training or fitting any auxiliary\nmodels. We evaluate PQMass on data of various modalities and dimensions,\ndemonstrating its effectiveness in assessing the quality, novelty, and\ndiversity of generated samples. 
We further show that PQMass scales well to\nmoderately high-dimensional data and thus obviates the need for feature\nextraction in practical applications.\n","authors":["Pablo Lemos","Sammy Sharief","Nikolay Malkin","Salma Salhi","Connor Stone","Laurence Perreault-Levasseur","Yashar Hezaveh"],"pdf_url":"https://arxiv.org/pdf/2402.04355v2.pdf","comment":"Published as a conference paper at ICLR 2025"}],"Genomics":[{"id":"http://arxiv.org/abs/2503.04490v1","updated":"2025-03-06T14:38:20Z","published":"2025-03-06T14:38:20Z","title":"Large Language Models in Bioinformatics: A Survey","summary":" Large Language Models (LLMs) are revolutionizing bioinformatics, enabling\nadvanced analysis of DNA, RNA, proteins, and single-cell data. This survey\nprovides a systematic review of recent advancements, focusing on genomic\nsequence modeling, RNA structure prediction, protein function inference, and\nsingle-cell transcriptomics. Meanwhile, we also discuss several key challenges,\nincluding data scarcity, computational complexity, and cross-omics integration,\nand explore future directions such as multimodal learning, hybrid AI models,\nand clinical applications. By offering a comprehensive perspective, this paper\nunderscores the transformative potential of LLMs in driving innovations in\nbioinformatics and precision medicine.\n","authors":["Zhenyu Wang","Zikang Wang","Jiyue Jiang","Pengan Chen","Xiangyu Shi","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2503.04490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04347v1","updated":"2025-03-06T11:43:30Z","published":"2025-03-06T11:43:30Z","title":"Large Language Models for Zero-shot Inference of Causal Structures in\n Biology","summary":" Genes, proteins and other biological entities influence one another via\ncausal molecular networks. Causal relationships in such networks are mediated\nby complex and diverse mechanisms, through latent variables, and are often\nspecific to cellular context. 
It remains challenging to characterise such\nnetworks in practice. Here, we present a novel framework to evaluate large\nlanguage models (LLMs) for zero-shot inference of causal relationships in\nbiology. In particular, we systematically evaluate causal claims obtained from\nan LLM using real-world interventional data. This is done over one hundred\nvariables and thousands of causal hypotheses. Furthermore, we consider several\nprompting and retrieval-augmentation strategies, including large, and\npotentially conflicting, collections of scientific articles. Our results show\nthat with tailored augmentation and prompting, even relatively small LLMs can\ncapture meaningful aspects of causal structure in biological systems. This\nsupports the notion that LLMs could act as orchestration tools in biological\ndiscovery, by helping to distil current knowledge in ways amenable to\ndownstream analysis. Our approach to assessing LLMs with respect to\nexperimental data is relevant for a broad range of problems at the intersection\nof causal learning, LLMs and scientific discovery.\n","authors":["Izzy Newsham","Luka Kovačević","Richard Moulange","Nan Rosemary Ke","Sach Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2503.04347v1.pdf","comment":"ICLR 2025 Workshop on Machine Learning for Genomics Explorations"},{"id":"http://arxiv.org/abs/2502.07272v2","updated":"2025-03-06T05:41:32Z","published":"2025-02-11T05:39:49Z","title":"GENERator: A Long-Context Generative Genomic Foundation Model","summary":" Advancements in DNA sequencing technologies have significantly improved our\nability to decode genomic sequences. However, the prediction and interpretation\nof these sequences remain challenging due to the intricate nature of genetic\nmaterial. Large language models (LLMs) have introduced new opportunities for\nbiological sequence analysis. 
Recent developments in genomic language models\nhave underscored the potential of LLMs in deciphering DNA sequences.\nNonetheless, existing models often face limitations in robustness and\napplication scope, primarily due to constraints in model structure and training\ndata scale. To address these limitations, we present GENERator, a generative\ngenomic foundation model featuring a context length of 98k base pairs (bp) and\n1.2B parameters. Trained on an expansive dataset comprising 386B bp of\neukaryotic DNA, the GENERator demonstrates state-of-the-art performance across\nboth established and newly proposed benchmarks. The model adheres to the\ncentral dogma of molecular biology, accurately generating protein-coding\nsequences that translate into proteins structurally analogous to known\nfamilies. It also shows significant promise in sequence optimization,\nparticularly through the prompt-responsive generation of enhancer sequences\nwith specific activity profiles. These capabilities position the GENERator as a\npivotal tool for genomic research and biotechnological advancement, enhancing\nour ability to interpret and predict complex biological systems and enabling\nprecise genomic interventions. Implementation details and supplementary\nresources are available at https://github.com/GenerTeam/GENERator.\n","authors":["Wei Wu","Qiuyi Li","Mingyang Li","Kun Fu","Fuli Feng","Jieping Ye","Hui Xiong","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2502.07272v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2503.04725v1","updated":"2025-03-06T18:59:48Z","published":"2025-03-06T18:59:48Z","title":"L$^2$M: Mutual Information Scaling Law for Long-Context Language\n Modeling","summary":" We rigorously establish a bipartite mutual information scaling law in natural\nlanguage that governs long-range dependencies. 
This scaling law, which we show\nis distinct from and scales independently of the conventional two-point mutual\ninformation, is the key to understanding long-context language modeling. Using\nthis scaling law, we formulate the Long-context Language Modeling (L$^2$M)\ncondition, which relates a model's capacity for effective long context length\nmodeling to the scaling of its latent state size for storing past information.\nOur results are validated through experiments on both transformers and state\nspace models. This work establishes a theoretical foundation that guides the\ndevelopment of large language models toward longer context lengths.\n","authors":["Zhuo Chen","Oriol Mayné i Comas","Zhuotao Jin","Di Luo","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2503.04725v1.pdf","comment":"29 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2503.04722v1","updated":"2025-03-06T18:59:23Z","published":"2025-03-06T18:59:23Z","title":"Enough Coin Flips Can Make LLMs Act Bayesian","summary":" Large language models (LLMs) exhibit the ability to generalize given few-shot\nexamples in their input prompt, an emergent capability known as in-context\nlearning (ICL). We investigate whether LLMs utilize ICL to perform structured\nreasoning in ways that are consistent with a Bayesian framework or rely on\npattern matching. Using a controlled setting of biased coin flips, we find\nthat: (1) LLMs often possess biased priors, causing initial divergence in\nzero-shot settings, (2) in-context evidence outweighs explicit bias\ninstructions, (3) LLMs broadly follow Bayesian posterior updates, with\ndeviations primarily due to miscalibrated priors rather than flawed updates,\nand (4) attention magnitude has negligible effect on Bayesian inference. With\nsufficient demonstrations of biased coin flips via ICL, LLMs update their\npriors in a Bayesian manner.\n","authors":["Ritwik Gupta","Rodolfo Corona","Jiaxin Ge","Eric Wang","Dan Klein","Trevor Darrell","David M. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2503.04722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04718v1","updated":"2025-03-06T18:58:45Z","published":"2025-03-06T18:58:45Z","title":"Floxels: Fast Unsupervised Voxel Based Scene Flow Estimation","summary":" Scene flow estimation is a foundational task for many robotic applications,\nincluding robust dynamic object detection, automatic labeling, and sensor\nsynchronization. Two types of approaches to the problem have evolved: 1)\nSupervised and 2) optimization-based methods. Supervised methods are fast\nduring inference and achieve high-quality results, however, they are limited by\nthe need for large amounts of labeled training data and are susceptible to\ndomain gaps. In contrast, unsupervised test-time optimization methods do not\nface the problem of domain gaps but usually suffer from substantial runtime,\nexhibit artifacts, or fail to converge to the right solution. In this work, we\nmitigate several limitations of existing optimization-based methods. To this\nend, we 1) introduce a simple voxel grid-based model that improves over the\nstandard MLP-based formulation in multiple dimensions and 2) introduce a new\nmultiframe loss formulation. 3) We combine both contributions in our new\nmethod, termed Floxels. On the Argoverse 2 benchmark, Floxels is surpassed only\nby EulerFlow among unsupervised methods while achieving comparable performance\nat a fraction of the computational cost. Floxels achieves a massive speedup of\nmore than ~60 - 140x over EulerFlow, reducing the runtime from a day to 10\nminutes per sequence. Over the faster but low-quality baseline, NSFP, Floxels\nachieves a speedup of ~14x.\n","authors":["David T. 
Hoffmann","Syed Haseeb Raza","Hanqiu Jiang","Denis Tananaev","Steffen Klingenhoefer","Martin Meinke"],"pdf_url":"https://arxiv.org/pdf/2503.04718v1.pdf","comment":"Accepted at CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04715v1","updated":"2025-03-06T18:58:29Z","published":"2025-03-06T18:58:29Z","title":"Predictable Scale: Part I -- Optimal Hyperparameter Scaling Law in Large\n Language Model Pretraining","summary":" The impressive capabilities of Large Language Models (LLMs) across diverse\ntasks are now well-established, yet their effective deployment necessitates\ncareful hyperparameter optimization. Through extensive empirical studies\ninvolving grid searches across diverse configurations, we discover universal\nscaling laws governing these hyperparameters: optimal learning rate follows a\npower-law relationship with both model parameters and data sizes, while optimal\nbatch size scales primarily with data sizes. Our analysis reveals a convex\noptimization landscape for hyperparameters under fixed models and data size\nconditions. This convexity implies an optimal hyperparameter plateau. We\ncontribute a universal, plug-and-play optimal hyperparameter tool for the\ncommunity. Its estimated values on the test set are merely 0.07\\% away from the\nglobally optimal LLM performance found via an exhaustive search. These laws\ndemonstrate remarkable robustness across variations in model sparsity, training\ndata distribution, and model shape. To our best known, this is the first work\nthat unifies different model shapes and structures, such as Mixture-of-Experts\nmodels and dense transformers, as well as establishes optimal hyperparameter\nscaling laws across diverse data distributions. This exhaustive optimization\nprocess demands substantial computational resources, utilizing nearly one\nmillion NVIDIA H800 GPU hours to train 3,700 LLMs of varying sizes and\nhyperparameters from scratch and consuming approximately 100 trillion tokens in\ntotal. 
To facilitate reproducibility and further research, we will\nprogressively release all loss measurements and model checkpoints through our\ndesignated repository https://step-law.github.io/\n","authors":["Houyi Li","Wenzheng Zheng","Jingcheng Hu","Qiufeng Wang","Hanshan Zhang","Zili Wang","Yangshijie Xu","Shuigeng Zhou","Xiangyu Zhang","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2503.04715v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2503.04713v1","updated":"2025-03-06T18:57:40Z","published":"2025-03-06T18:57:40Z","title":"Scaling Rich Style-Prompted Text-to-Speech Datasets","summary":" We introduce Paralinguistic Speech Captions (ParaSpeechCaps), a large-scale\ndataset that annotates speech utterances with rich style captions. While rich\nabstract tags (e.g. guttural, nasal, pained) have been explored in small-scale\nhuman-annotated datasets, existing large-scale datasets only cover basic tags\n(e.g. low-pitched, slow, loud). We combine off-the-shelf text and speech\nembedders, classifiers and an audio language model to automatically scale rich\ntag annotations for the first time. ParaSpeechCaps covers a total of 59 style\ntags, including both speaker-level intrinsic tags and utterance-level\nsituational tags. It consists of 342 hours of human-labelled data (PSC-Base)\nand 2427 hours of automatically annotated data (PSC-Scaled). We finetune\nParler-TTS, an open-source style-prompted TTS model, on ParaSpeechCaps, and\nachieve improved style consistency (+7.9% Consistency MOS) and speech quality\n(+15.5% Naturalness MOS) over the best performing baseline that combines\nexisting rich style tag datasets. We ablate several of our dataset design\nchoices to lay the foundation for future work in this space. 
Our dataset,\nmodels and code are released at https://github.com/ajd12342/paraspeechcaps .\n","authors":["Anuj Diwan","Zhisheng Zheng","David Harwath","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2503.04713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04712v1","updated":"2025-03-06T18:57:34Z","published":"2025-03-06T18:57:34Z","title":"Efficiently Escaping Saddle Points under Generalized Smoothness via\n Self-Bounding Regularity","summary":" In this paper, we study the problem of non-convex optimization on functions\nthat are not necessarily smooth using first order methods. Smoothness\n(functions whose gradient and/or Hessian are Lipschitz) is not satisfied by\nmany machine learning problems in both theory and practice, motivating a recent\nline of work studying the convergence of first order methods to first order\nstationary points under appropriate generalizations of smoothness.\n We develop a novel framework to study convergence of first order methods to\nfirst and \\textit{second} order stationary points under generalized smoothness,\nunder more general smoothness assumptions than the literature. Using our\nframework, we show appropriate variants of GD and SGD (e.g. with appropriate\nperturbations) can converge not just to first order but also \\textit{second\norder stationary points} in runtime polylogarithmic in the dimension. To our\nknowledge, our work contains the first such result, as well as the first\n'non-textbook' rate for non-convex optimization under generalized smoothness.\nWe demonstrate that several canonical non-convex optimization problems fall\nunder our setting and framework.\n","authors":["Daniel Yiming Cao","August Y. 
Chen","Karthik Sridharan","Benjamin Tang"],"pdf_url":"https://arxiv.org/pdf/2503.04712v1.pdf","comment":"79 pages"},{"id":"http://arxiv.org/abs/2503.04706v1","updated":"2025-03-06T18:54:42Z","published":"2025-03-06T18:54:42Z","title":"Sample-Optimal Agnostic Boosting with Unlabeled Data","summary":" Boosting provides a practical and provably effective framework for\nconstructing accurate learning algorithms from inaccurate rules of thumb. It\nextends the promise of sample-efficient learning to settings where direct\nEmpirical Risk Minimization (ERM) may not be implementable efficiently. In the\nrealizable setting, boosting is known to offer this computational reprieve\nwithout compromising on sample efficiency. However, in the agnostic case,\nexisting boosting algorithms fall short of achieving the optimal sample\ncomplexity.\n This paper highlights an unexpected and previously unexplored avenue of\nimprovement: unlabeled samples. We design a computationally efficient agnostic\nboosting algorithm that matches the sample complexity of ERM, given\npolynomially many additional unlabeled samples. In fact, we show that the total\nnumber of samples needed, unlabeled and labeled inclusive, is never more than\nthat for the best known agnostic boosting algorithm -- so this result is never\nworse -- while only a vanishing fraction of these need to be labeled for the\nalgorithm to succeed. 
This is particularly fortuitous for learning-theoretic\napplications of agnostic boosting, which often take place in the\ndistribution-specific setting, where unlabeled samples can be availed for free.\nWe detail other applications of this result in reinforcement learning.\n","authors":["Udaya Ghai","Karan Singh"],"pdf_url":"https://arxiv.org/pdf/2503.04706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04704v1","updated":"2025-03-06T18:54:32Z","published":"2025-03-06T18:54:32Z","title":"Universality of Layer-Level Entropy-Weighted Quantization Beyond Model\n Architecture and Size","summary":" We present a novel approach to selective model quantization that transcends\nthe limitations of architecture-specific and size-dependent compression methods\nfor Large Language Models (LLMs) using Entropy-Weighted Quantization (EWQ). By\nanalyzing the entropy distribution across transformer blocks, EWQ determines\nwhich blocks can be safely quantized without causing significant performance\ndegradation, independent of model architecture or size. Our method outperforms\nuniform quantization approaches, maintaining Massive Multitask Language\nUnderstanding (MMLU) accuracy scores within 0.5% of unquantized models while\nreducing memory usage by up to 18%. We demonstrate the effectiveness of EWQ\nacross multiple architectures-from 1.6B to 70B parameters-showcasing consistent\nimprovements in the quality-compression trade-off regardless of model scale or\narchitectural design. A surprising finding of EWQ is its ability to reduce\nperplexity compared to unquantized models, suggesting the presence of\nbeneficial regularization through selective precision reduction. This\nimprovement holds across different model families, indicating a fundamental\nrelationship between layer-level entropy and optimal precision requirements.\nAdditionally, we introduce FastEWQ, a rapid method for entropy distribution\nanalysis that eliminates the need for loading model weights. 
This technique\nleverages universal characteristics of entropy distribution that persist across\nvarious architectures and scales, enabling near-instantaneous quantization\ndecisions while maintaining 80% classification accuracy with full entropy\nanalysis. Our results demonstrate that effective quantization strategies can be\ndeveloped independently of specific architectural choices or model sizes,\nopening new possibilities for efficient LLM deployment.\n","authors":["Alireza Behtash","Marijan Fofonjka","Ethan Baird","Tyler Mauer","Hossein Moghimifam","David Stout","Joel Dennison"],"pdf_url":"https://arxiv.org/pdf/2503.04704v1.pdf","comment":"29 pages, 7 figures, 14 tables; Comments are welcome"},{"id":"http://arxiv.org/abs/2503.04697v1","updated":"2025-03-06T18:43:29Z","published":"2025-03-06T18:43:29Z","title":"L1: Controlling How Long A Reasoning Model Thinks With Reinforcement\n Learning","summary":" Reasoning language models have shown an uncanny ability to improve\nperformance at test-time by ``thinking longer''-that is, by generating longer\nchain-of-thought sequences and hence using more compute. However, the length of\ntheir chain-of-thought reasoning is not controllable, making it impossible to\nallocate test-time compute to achieve a desired level of performance. We\nintroduce Length Controlled Policy Optimization (LCPO), a simple reinforcement\nlearning method that optimizes for accuracy and adherence to user-specified\nlength constraints. We use LCPO to train L1, a reasoning language model that\nproduces outputs satisfying a length constraint given in its prompt. L1's\nlength control allows for smoothly trading off computational cost and accuracy\non a wide range of tasks, and outperforms the state-of-the-art S1 method for\nlength control. Furthermore, we uncover an unexpected short chain-of-thought\ncapability in models trained with LCPO. For instance, our 1.5B L1 model\nsurpasses GPT-4o at equal reasoning lengths. 
Overall, LCPO enables precise\ncontrol over reasoning length, allowing for fine-grained allocation of\ntest-time compute and accuracy. We release code and models at\nhttps://www.cmu-l3.github.io/l1\n","authors":["Pranjal Aggarwal","Sean Welleck"],"pdf_url":"https://arxiv.org/pdf/2503.04697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01843v2","updated":"2025-03-06T18:38:33Z","published":"2025-03-03T18:59:40Z","title":"When Can You Get Away with Low Memory Adam?","summary":" Adam is the go-to optimizer for training modern machine learning models, but\nit requires additional memory to maintain the moving averages of the gradients\nand their squares. While various low-memory optimizers have been proposed that\nsometimes match the performance of Adam, their lack of reliability has left\nAdam as the default choice. In this work, we apply a simple layer-wise\nSignal-to-Noise Ratio (SNR) analysis to quantify when second-moment tensors can\nbe effectively replaced by their means across different dimensions. Our SNR\nanalysis reveals how architecture, training hyperparameters, and dataset\nproperties impact compressibility along Adam's trajectory, naturally leading to\n$\\textit{SlimAdam}$, a memory-efficient Adam variant. $\\textit{SlimAdam}$\ncompresses the second moments along dimensions with high SNR when feasible, and\nleaves when compression would be detrimental. Through experiments across a\ndiverse set of architectures and training scenarios, we show that\n$\\textit{SlimAdam}$ matches Adam's performance and stability while saving up to\n$98\\%$ of total second moments. 
Code for $\\textit{SlimAdam}$ is available at\nhttps://github.com/dayal-kalra/low-memory-adam.\n","authors":["Dayal Singh Kalra","John Kirchenbauer","Maissam Barkeshli","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2503.01843v2.pdf","comment":"Acknowledgement updates and minor writing edits"},{"id":"http://arxiv.org/abs/2503.04690v1","updated":"2025-03-06T18:32:35Z","published":"2025-03-06T18:32:35Z","title":"Coarse graining and reduced order models for plume ejection dynamics","summary":" Monitoring the atmospheric dispersion of pollutants is increasingly critical\nfor environmental impact assessments. High-fidelity computational models are\noften employed to simulate plume dynamics, guiding decision-making and\nprioritizing resource deployment. However, such models can be prohibitively\nexpensive to simulate, as they require resolving turbulent flows at fine\nspatial and temporal resolutions. Moreover, there are at least two distinct\ndynamical regimes of interest in the plume: (i) the initial ejection of the\nplume where turbulent mixing is generated by the shear-driven Kelvin-Helmholtz\ninstability, and (ii) the ensuing turbulent diffusion and advection which is\noften modeled by the Gaussian plume model. We address the challenge of modeling\nthe initial plume generation. Specifically, we propose a data-driven framework\nthat identifies a reduced-order analytical model for plume dynamics -- directly\nfrom video data. We extract a time series of plume center and edge points from\nvideo snapshots and evaluate different regressions based to their extrapolation\nperformance to generate a time series of coefficients that characterize the\nplume's overall direction and spread. We regress to a sinusoidal model inspired\nby the Kelvin-Helmholtz instability for the edge points in order to identify\nthe plume's dispersion and vorticity. 
Overall, this reduced-order modeling\nframework provides a data-driven and lightweight approach to capture the\ndominant features of the initial nonlinear point-source plume dynamics,\nagnostic to plume type and starting only from video. The resulting model is a\npre-cursor to standard models such as the Gaussian plume model and has the\npotential to enable rapid assessment and evaluation of critical environmental\nhazards, such as methane leaks, chemical spills, and pollutant dispersal from\nsmokestacks.\n","authors":["Ike Griss Salas","Megan R. Ebers","Jake Stevens-Haas","J. Nathan Kutz"],"pdf_url":"https://arxiv.org/pdf/2503.04690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02800v2","updated":"2025-03-06T18:30:45Z","published":"2025-03-04T17:20:43Z","title":"RAAD-LLM: Adaptive Anomaly Detection Using LLMs and RAG Integration","summary":" Anomaly detection in complex industrial environments poses unique challenges,\nparticularly in contexts characterized by data sparsity and evolving\noperational conditions. Predictive maintenance (PdM) in such settings demands\nmethodologies that are adaptive, transferable, and capable of integrating\ndomain-specific knowledge. In this paper, we present RAAD-LLM, a novel\nframework for adaptive anomaly detection, leveraging large language models\n(LLMs) integrated with Retrieval-Augmented Generation (RAG). This approach\naddresses the aforementioned PdM challenges. By effectively utilizing\ndomain-specific knowledge, RAAD-LLM enhances the detection of anomalies in time\nseries data without requiring fine-tuning on specific datasets. The framework's\nadaptability mechanism enables it to adjust its understanding of normal\noperating conditions dynamically, thus increasing detection accuracy. We\nvalidate this methodology through a real-world application for a plastics\nmanufacturing plant and the Skoltech Anomaly Benchmark (SKAB). 
Results show\nsignificant improvements over our previous model with an accuracy increase from\n70.7% to 89.1% on the real-world dataset. By allowing for the enriching of\ninput series data with semantics, RAAD-LLM incorporates multimodal capabilities\nthat facilitate more collaborative decision-making between the model and plant\noperators. Overall, our findings support RAAD-LLM's ability to revolutionize\nanomaly detection methodologies in PdM, potentially leading to a paradigm shift\nin how anomaly detection is implemented across various industries.\n","authors":["Alicia Russell-Gilbert","Sudip Mittal","Shahram Rahimi","Maria Seale","Joseph Jabour","Thomas Arnold","Joshua Church"],"pdf_url":"https://arxiv.org/pdf/2503.02800v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2411.00914"},{"id":"http://arxiv.org/abs/2503.04687v1","updated":"2025-03-06T18:29:45Z","published":"2025-03-06T18:29:45Z","title":"Compositional World Knowledge leads to High Utility Synthetic data","summary":" Machine learning systems struggle with robustness, under subpopulation\nshifts. This problem becomes especially pronounced in scenarios where only a\nsubset of attribute combinations is observed during training -a severe form of\nsubpopulation shift, referred as compositional shift. To address this problem,\nwe ask the following question: Can we improve the robustness by training on\nsynthetic data, spanning all possible attribute combinations? We first show\nthat training of conditional diffusion models on limited data lead to incorrect\nunderlying distribution. Therefore, synthetic data sampled from such models\nwill result in unfaithful samples and does not lead to improve performance of\ndownstream machine learning systems. To address this problem, we propose CoInD\nto reflect the compositional nature of the world by enforcing conditional\nindependence through minimizing Fisher's divergence between joint and marginal\ndistributions. 
We demonstrate that synthetic data generated by CoInD is\nfaithful and this translates to state-of-the-art worst-group accuracy on\ncompositional shift tasks on CelebA.\n","authors":["Sachit Gaudi","Gautam Sreekumar","Vishnu Boddeti"],"pdf_url":"https://arxiv.org/pdf/2503.04687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04684v1","updated":"2025-03-06T18:26:42Z","published":"2025-03-06T18:26:42Z","title":"Propagating Model Uncertainty through Filtering-based Probabilistic\n Numerical ODE Solvers","summary":" Filtering-based probabilistic numerical solvers for ordinary differential\nequations (ODEs), also known as ODE filters, have been established as efficient\nmethods for quantifying numerical uncertainty in the solution of ODEs. In\npractical applications, however, the underlying dynamical system often contains\nuncertain parameters, requiring the propagation of this model uncertainty to\nthe ODE solution. In this paper, we demonstrate that ODE filters, despite their\nprobabilistic nature, do not automatically solve this uncertainty propagation\nproblem. To address this limitation, we present a novel approach that combines\nODE filters with numerical quadrature to properly marginalize over uncertain\nparameters, while accounting for both parameter uncertainty and numerical\nsolver uncertainty. Experiments across multiple dynamical systems demonstrate\nthat the resulting uncertainty estimates closely match reference solutions.\nNotably, we show how the numerical uncertainty from the ODE solver can help\nprevent overconfidence in the propagated uncertainty estimates, especially when\nusing larger step sizes. 
Our results illustrate that probabilistic numerical\nmethods can effectively quantify both numerical and parametric uncertainty in\ndynamical systems.\n","authors":["Dingling Yao","Filip Tronarp","Nathanael Bosch"],"pdf_url":"https://arxiv.org/pdf/2503.04684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04680v1","updated":"2025-03-06T18:22:46Z","published":"2025-03-06T18:22:46Z","title":"Matrix Factorization for Inferring Associations and Missing Links","summary":" Missing link prediction is a method for network analysis, with applications\nin recommender systems, biology, social sciences, cybersecurity, information\nretrieval, and Artificial Intelligence (AI) reasoning in Knowledge Graphs.\nMissing link prediction identifies unseen but potentially existing connections\nin a network by analyzing the observed patterns and relationships. In\nproliferation detection, this supports efforts to identify and characterize\nattempts by state and non-state actors to acquire nuclear weapons or associated\ntechnology - a notoriously challenging but vital mission for global security.\nDimensionality reduction techniques like Non-Negative Matrix Factorization\n(NMF) and Logistic Matrix Factorization (LMF) are effective but require\nselection of the matrix rank parameter, that is, of the number of hidden\nfeatures, k, to avoid over/under-fitting. We introduce novel Weighted (WNMFk),\nBoolean (BNMFk), and Recommender (RNMFk) matrix factorization methods, along\nwith ensemble variants incorporating logistic factorization, for link\nprediction. Our methods integrate automatic model determination for rank\nestimation by evaluating stability and accuracy using a modified bootstrap\nmethodology and uncertainty quantification (UQ), assessing prediction\nreliability under random perturbations. We incorporate Otsu threshold selection\nand k-means clustering for Boolean matrix factorization, comparing them to\ncoordinate descent-based Boolean thresholding. 
Our experiments highlight the\nimpact of rank k selection, evaluate model performance under varying test-set\nsizes, and demonstrate the benefits of UQ for reliable predictions using\nabstention. We validate our methods on three synthetic datasets (Boolean and\nuniformly distributed) and benchmark them against LMF and symmetric LMF\n(symLMF) on five real-world protein-protein interaction networks, showcasing an\nimproved prediction performance.\n","authors":["Ryan Barron","Maksim E. Eren","Duc P. Truong","Cynthia Matuszek","James Wendelberger","Mary F. Dorn","Boian Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2503.04680v1.pdf","comment":"35 pages, 14 figures, 3 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2503.04679v1","updated":"2025-03-06T18:22:29Z","published":"2025-03-06T18:22:29Z","title":"Multi-Agent Inverse Q-Learning from Demonstrations","summary":" When reward functions are hand-designed, deep reinforcement learning\nalgorithms often suffer from reward misspecification, causing them to learn\nsuboptimal policies in terms of the intended task objectives. In the\nsingle-agent case, inverse reinforcement learning (IRL) techniques attempt to\naddress this issue by inferring the reward function from expert demonstrations.\nHowever, in multi-agent problems, misalignment between the learned and true\nobjectives is exacerbated due to increased environment non-stationarity and\nvariance that scales with multiple agents. As such, in multi-agent general-sum\ngames, multi-agent IRL algorithms have difficulty balancing cooperative and\ncompetitive objectives. To address these issues, we propose Multi-Agent\nMarginal Q-Learning from Demonstrations (MAMQL), a novel sample-efficient\nframework for multi-agent IRL. For each agent, MAMQL learns a critic\nmarginalized over the other agents' policies, allowing for a well-motivated use\nof Boltzmann policies in the multi-agent context. 
We identify a connection\nbetween optimal marginalized critics and single-agent soft-Q IRL, allowing us\nto apply a direct, simple optimization criterion from the single-agent domain.\nAcross our experiments on three different simulated domains, MAMQL\nsignificantly outperforms previous multi-agent methods in average reward,\nsample efficiency, and reward recovery by often more than 2-5x. We make our\ncode available at https://sites.google.com/view/mamql .\n","authors":["Nathaniel Haynam","Adam Khoja","Dhruv Kumar","Vivek Myers","Erdem Bıyık"],"pdf_url":"https://arxiv.org/pdf/2503.04679v1.pdf","comment":"8 pages, 4 figures, 2 tables. Published at the International\n Conference on Robotics and Automation (ICRA) 2025"},{"id":"http://arxiv.org/abs/2410.06186v4","updated":"2025-03-06T18:20:00Z","published":"2024-10-08T16:51:10Z","title":"The Last Iterate Advantage: Empirical Auditing and Principled Heuristic\n Analysis of Differentially Private SGD","summary":" We propose a simple heuristic privacy analysis of noisy clipped stochastic\ngradient descent (DP-SGD) in the setting where only the last iterate is\nreleased and the intermediate iterates remain hidden. Namely, our heuristic\nassumes a linear structure for the model.\n We show experimentally that our heuristic is predictive of the outcome of\nprivacy auditing applied to various training procedures. Thus it can be used\nprior to training as a rough estimate of the final privacy leakage. We also\nprobe the limitations of our heuristic by providing some artificial\ncounterexamples where it underestimates the privacy leakage.\n The standard composition-based privacy analysis of DP-SGD effectively assumes\nthat the adversary has access to all intermediate iterates, which is often\nunrealistic. 
However, this analysis remains the state of the art in practice.\nWhile our heuristic does not replace a rigorous privacy analysis, it\nillustrates the large gap between the best theoretical upper bounds and the\nprivacy auditing lower bounds and sets a target for further work to improve the\ntheoretical privacy analyses. We also empirically support our heuristic and\nshow existing privacy auditing attacks are bounded by our heuristic analysis in\nboth vision and language tasks.\n","authors":["Thomas Steinke","Milad Nasr","Arun Ganesh","Borja Balle","Christopher A. Choquette-Choo","Matthew Jagielski","Jamie Hayes","Abhradeep Guha Thakurta","Adam Smith","Andreas Terzis"],"pdf_url":"https://arxiv.org/pdf/2410.06186v4.pdf","comment":"ICLR 2025 camera-ready version"},{"id":"http://arxiv.org/abs/2402.10065v2","updated":"2025-03-06T18:17:02Z","published":"2024-02-15T16:30:55Z","title":"Some Targets Are Harder to Identify than Others: Quantifying the\n Target-dependent Membership Leakage","summary":" In a Membership Inference (MI) game, an attacker tries to infer whether a\ntarget point was included or not in the input of an algorithm. Existing works\nshow that some target points are easier to identify, while others are harder.\nThis paper explains the target-dependent hardness of membership attacks by\nstudying the powers of the optimal attacks in a fixed-target MI game. We\ncharacterise the optimal advantage and trade-off functions of attacks against\nthe empirical mean in terms of the Mahalanobis distance between the target\npoint and the data-generating distribution. We further derive the impacts of\ntwo privacy defences, i.e. adding Gaussian noise and sub-sampling, and that of\ntarget misspecification on optimal attacks. As by-products of our novel\nanalysis of the Likelihood Ratio (LR) test, we provide a new covariance attack\nwhich generalises and improves the scalar product attack. 
Also, we propose a\nnew optimal canary-choosing strategy for auditing privacy in the white-box\nfederated learning setting. Our experiments validate that the Mahalanobis score\nexplains the hardness of fixed-target MI games.\n","authors":["Achraf Azize","Debabrota Basu"],"pdf_url":"https://arxiv.org/pdf/2402.10065v2.pdf","comment":"Appears in AISTATS 2025 (Oral)"},{"id":"http://arxiv.org/abs/2502.02067v2","updated":"2025-03-06T18:09:38Z","published":"2025-02-04T07:32:39Z","title":"AdaptBot: Combining LLM with Knowledge Graphs and Human Input for\n Generic-to-Specific Task Decomposition and Knowledge Refinement","summary":" An embodied agent assisting humans is often asked to complete new tasks, and\nthere may not be sufficient time or labeled examples to train the agent to\nperform these new tasks. Large Language Models (LLMs) trained on considerable\nknowledge across many domains can be used to predict a sequence of abstract\nactions for completing such tasks, although the agent may not be able to\nexecute this sequence due to task-, agent-, or domain-specific constraints. Our\nframework addresses these challenges by leveraging the generic predictions\nprovided by LLM and the prior domain knowledge encoded in a Knowledge Graph\n(KG), enabling an agent to quickly adapt to new tasks. The robot also solicits\nand uses human input as needed to refine its existing knowledge. Based on\nexperimental evaluation in the context of cooking and cleaning tasks in\nsimulation domains, we demonstrate that the interplay between LLM, KG, and\nhuman input leads to substantial performance gains compared with just using the\nLLM. 
Project website{\\S}: https://sssshivvvv.github.io/adaptbot/\n","authors":["Shivam Singh","Karthik Swaminathan","Nabanita Dash","Ramandeep Singh","Snehasis Banerjee","Mohan Sridharan","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2502.02067v2.pdf","comment":"Accepted to IEEE International Conference on Robotics and Automation\n (ICRA) 2025"},{"id":"http://arxiv.org/abs/2502.12360v2","updated":"2025-03-06T18:07:00Z","published":"2025-02-17T22:50:45Z","title":"Detecting Systematic Weaknesses in Vision Models along Predefined\n Human-Understandable Dimensions","summary":" Slice discovery methods (SDMs) are prominent algorithms for finding\nsystematic weaknesses in DNNs. They identify top-k semantically coherent\nslices/subsets of data where a DNN-under-test has low performance. For being\ndirectly useful, slices should be aligned with human-understandable and\nrelevant dimensions, which, for example, are defined by safety and domain\nexperts as part of the operational design domain (ODD). While SDMs can be\napplied effectively on structured data, their application on image data is\ncomplicated by the lack of semantic metadata. To address these issues, we\npresent an algorithm that combines foundation models for zero-shot image\nclassification to generate semantic metadata with methods for combinatorial\nsearch to find systematic weaknesses in images. In contrast to existing\napproaches, ours identifies weak slices that are in line with pre-defined\nhuman-understandable dimensions. As the algorithm includes foundation models,\nits intermediate and final results may not always be exact. Therefore, we\ninclude an approach to address the impact of noisy metadata. We validate our\nalgorithm on both synthetic and real-world datasets, demonstrating its ability\nto recover human-understandable systematic weaknesses. 
Furthermore, using our\napproach, we identify systematic weaknesses of multiple pre-trained and\npublicly available state-of-the-art computer vision DNNs.\n","authors":["Sujan Sai Gannamaneni","Rohil Prakash Rao","Michael Mock","Maram Akila","Stefan Wrobel"],"pdf_url":"https://arxiv.org/pdf/2502.12360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04667v1","updated":"2025-03-06T17:59:51Z","published":"2025-03-06T17:59:51Z","title":"An Information-theoretic Multi-task Representation Learning Framework\n for Natural Language Understanding","summary":" This paper proposes a new principled multi-task representation learning\nframework (InfoMTL) to extract noise-invariant sufficient representations for\nall tasks. It ensures sufficiency of shared representations for all tasks and\nmitigates the negative effect of redundant features, which can enhance language\nunderstanding of pre-trained language models (PLMs) under the multi-task\nparadigm. Firstly, a shared information maximization principle is proposed to\nlearn more sufficient shared representations for all target tasks. It can avoid\nthe insufficiency issue arising from representation compression in the\nmulti-task paradigm. Secondly, a task-specific information minimization\nprinciple is designed to mitigate the negative effect of potential redundant\nfeatures in the input for each task. It can compress task-irrelevant redundant\ninformation and preserve necessary information relevant to the target for\nmulti-task prediction. 
Experiments on six classification benchmarks show that\nour method outperforms 12 comparative multi-task methods under the same\nmulti-task settings, especially in data-constrained and noisy scenarios.\nExtensive experiments demonstrate that the learned representations are more\nsufficient, data-efficient, and robust.\n","authors":["Dou Hu","Lingwei Wei","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2503.04667v1.pdf","comment":"11 pages, accepted to AAAI 2025 (main conference), the code is\n available at https://github.com/zerohd4869/InfoMTL"},{"id":"http://arxiv.org/abs/2503.04655v1","updated":"2025-03-06T17:49:13Z","published":"2025-03-06T17:49:13Z","title":"CLDyB: Towards Dynamic Benchmarking for Continual Learning with\n Pre-trained Models","summary":" The advent of the foundation model era has sparked significant research\ninterest in leveraging pre-trained representations for continual learning (CL),\nyielding a series of top-performing CL methods on standard evaluation\nbenchmarks. Nonetheless, there are growing concerns regarding potential data\ncontamination during the pre-training stage. Furthermore, standard evaluation\nbenchmarks, which are typically static, fail to capture the complexities of\nreal-world CL scenarios, resulting in saturated performance. To address these\nissues, we describe CL on dynamic benchmarks (CLDyB), a general computational\nframework based on Markov decision processes for evaluating CL methods\nreliably. CLDyB dynamically identifies inherently difficult and\nalgorithm-dependent tasks for the given CL methods, and determines challenging\ntask orders using Monte Carlo tree search. Leveraging CLDyB, we first conduct a\njoint evaluation of multiple state-of-the-art CL methods, leading to a set of\ncommonly challenging and generalizable task sequences where existing CL methods\ntend to perform poorly. 
We then conduct separate evaluations of individual CL\nmethods using CLDyB, discovering their respective strengths and weaknesses. The\nsource code and generated task sequences are publicly accessible at\nhttps://github.com/szc12153/CLDyB.\n","authors":["Shengzhuang Chen","Yikai Liao","Xiaoxiao Sun","Kede Ma","Ying Wei"],"pdf_url":"https://arxiv.org/pdf/2503.04655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04650v1","updated":"2025-03-06T17:39:12Z","published":"2025-03-06T17:39:12Z","title":"Joint Masked Reconstruction and Contrastive Learning for Mining\n Interactions Between Proteins","summary":" Protein-protein interaction (PPI) prediction is an instrumental means in\nelucidating the mechanisms underlying cellular operations, holding significant\npractical implications for the realms of pharmaceutical development and\nclinical treatment. Presently, the majority of research methods primarily\nconcentrate on the analysis of amino acid sequences, while investigations\npredicated on protein structures remain in the nascent stages of exploration.\nDespite the emergence of several structure-based algorithms in recent years,\nthese are still confronted with inherent challenges: (1) the extraction of\nintrinsic structural information of proteins typically necessitates the\nexpenditure of substantial computational resources; (2) these models are overly\nreliant on seen protein data, struggling to effectively unearth interaction\ncues between unknown proteins. To further propel advancements in this domain,\nthis paper introduces a novel PPI prediction method jointing masked\nreconstruction and contrastive learning, termed JmcPPI. 
This methodology\ndissects the PPI prediction task into two distinct phases: during the residue\nstructure encoding phase, JmcPPI devises two feature reconstruction tasks and\nemploys graph attention mechanism to capture structural information between\nresidues; during the protein interaction inference phase, JmcPPI perturbs the\noriginal PPI graph and employs a multi-graph contrastive learning strategy to\nthoroughly mine extrinsic interaction information of novel proteins. Extensive\nexperiments conducted on three widely utilized PPI datasets demonstrate that\nJmcPPI surpasses existing optimal baseline models across various data partition\nschemes. The associated code can be accessed via\nhttps://github.com/lijfrank-open/JmcPPI.\n","authors":["Jiang Li","Xiaoping Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04650v1.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2503.04649v1","updated":"2025-03-06T17:35:37Z","published":"2025-03-06T17:35:37Z","title":"Transferable Foundation Models for Geometric Tasks on Point Cloud\n Representations: Geometric Neural Operators","summary":" We introduce methods for obtaining pretrained Geometric Neural Operators\n(GNPs) that can serve as basal foundation models for use in obtaining geometric\nfeatures. These can be used within data processing pipelines for machine\nlearning tasks and numerical methods. We show how our GNPs can be trained to\nlearn robust latent representations for the differential geometry of\npoint-clouds to provide estimates of metric, curvature, and other shape-related\nfeatures. We demonstrate how our pre-trained GNPs can be used (i) to estimate\nthe geometric properties of surfaces of arbitrary shape and topologies with\nrobustness in the presence of noise, (ii) to approximate solutions of geometric\npartial differential equations (PDEs) on manifolds, and (iii) to solve\nequations for shape deformations such as curvature driven flows. 
We also\nrelease a package of the codes and weights for using our pre-trained GNPs for\nprocessing point cloud representations. This allows for incorporating our\npre-trained GNPs as components for reuse within existing and new data\nprocessing pipelines. The GNPs also can be used as part of numerical solvers\ninvolving geometry or as part of methods for performing inference and other\ngeometric tasks.\n","authors":["Blaine Quackenbush","Paul J. Atzberger"],"pdf_url":"https://arxiv.org/pdf/2503.04649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04873v2","updated":"2025-03-06T17:35:19Z","published":"2025-01-08T23:07:10Z","title":"Back Home: A Machine Learning Approach to Seashell Classification and\n Ecosystem Restoration","summary":" In Costa Rica, an average of 5 tons of seashells are extracted from\necosystems annually. Confiscated seashells, cannot be returned to their\necosystems due to the lack of origin recognition. To address this issue, we\ndeveloped a convolutional neural network (CNN) specifically for seashell\nidentification. We built a dataset from scratch, consisting of approximately\n19000 images from the Pacific and Caribbean coasts. Using this dataset, the\nmodel achieved a classification accuracy exceeding 85%. The model has been\nintegrated into a user-friendly application, which has classified over 36,000\nseashells to date, delivering real-time results within 3 seconds per image. 
To\nfurther enhance the system's accuracy, an anomaly detection mechanism was\nincorporated to filter out irrelevant or anomalous inputs, ensuring only valid\nseashell images are processed.\n","authors":["Alexander Valverde","Luis Solano"],"pdf_url":"https://arxiv.org/pdf/2501.04873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04641v1","updated":"2025-03-06T17:31:43Z","published":"2025-03-06T17:31:43Z","title":"Simulating the Real World: A Unified Survey of Multimodal Generative\n Models","summary":" Understanding and replicating the real world is a critical challenge in\nArtificial General Intelligence (AGI) research. To achieve this, many existing\napproaches, such as world models, aim to capture the fundamental principles\ngoverning the physical world, enabling more accurate simulations and meaningful\ninteractions. However, current methods often treat different modalities,\nincluding 2D (images), videos, 3D, and 4D representations, as independent\ndomains, overlooking their interdependencies. Additionally, these methods\ntypically focus on isolated dimensions of reality without systematically\nintegrating their connections. In this survey, we present a unified survey for\nmultimodal generative models that investigate the progression of data\ndimensionality in real-world simulation. Specifically, this survey starts from\n2D generation (appearance), then moves to video (appearance+dynamics) and 3D\ngeneration (appearance+geometry), and finally culminates in 4D generation that\nintegrate all dimensions. To the best of our knowledge, this is the first\nattempt to systematically unify the study of 2D, video, 3D and 4D generation\nwithin a single framework. To guide future research, we provide a comprehensive\nreview of datasets, evaluation metrics and future directions, and fostering\ninsights for newcomers. 
This survey serves as a bridge to advance the study of\nmultimodal generative models and real-world simulation within a unified\nframework.\n","authors":["Yuqi Hu","Longguang Wang","Xian Liu","Ling-Hao Chen","Yuwei Guo","Yukai Shi","Ce Liu","Anyi Rao","Zeyu Wang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.04641v1.pdf","comment":"Repository for the related papers at\n https://github.com/ALEEEHU/World-Simulator"},{"id":"http://arxiv.org/abs/2503.04639v1","updated":"2025-03-06T17:28:48Z","published":"2025-03-06T17:28:48Z","title":"Enhancing SAM with Efficient Prompting and Preference Optimization for\n Semi-supervised Medical Image Segmentation","summary":" Foundational models such as the Segment Anything Model (SAM) are gaining\ntraction in medical imaging segmentation, supporting multiple downstream tasks.\nHowever, such models are supervised in nature, still relying on large annotated\ndatasets or prompts supplied by experts. Conventional techniques such as active\nlearning to alleviate such limitations are limited in scope and still\nnecessitate continuous human involvement and complex domain knowledge for label\nrefinement or establishing reward ground truth. To address these challenges, we\npropose an enhanced Segment Anything Model (SAM) framework that utilizes\nannotation-efficient prompts generated in a fully unsupervised fashion, while\nstill capturing essential semantic, location, and shape information through\ncontrastive language-image pretraining and visual question answering. We adopt\nthe direct preference optimization technique to design an optimal policy that\nenables the model to generate high-fidelity segmentations with simple ratings\nor rankings provided by a virtual annotator simulating the human annotation\nprocess. 
State-of-the-art performance of our framework in tasks such as lung\nsegmentation, breast tumor segmentation, and organ segmentation across various\nmodalities, including X-ray, ultrasound, and abdominal CT, justifies its\neffectiveness in low-annotation data scenarios.\n","authors":["Aishik Konwer","Zhijian Yang","Erhan Bas","Cao Xiao","Prateek Prasanna","Parminder Bhatia","Taha Kass-Hout"],"pdf_url":"https://arxiv.org/pdf/2503.04639v1.pdf","comment":"Accepted to CVPR 2025"},{"id":"http://arxiv.org/abs/2503.04638v1","updated":"2025-03-06T17:25:46Z","published":"2025-03-06T17:25:46Z","title":"No Forgetting Learning: Memory-free Continual Learning","summary":" Continual Learning (CL) remains a central challenge in deep learning, where\nmodels must sequentially acquire new knowledge while mitigating Catastrophic\nForgetting (CF) of prior tasks. Existing approaches often struggle with\nefficiency and scalability, requiring extensive memory or model buffers. This\nwork introduces ``No Forgetting Learning\" (NFL), a memory-free CL framework\nthat leverages knowledge distillation to maintain stability while preserving\nplasticity. Memory-free means the NFL does not rely on any memory buffer.\nThrough extensive evaluations of three benchmark datasets, we demonstrate that\nNFL achieves competitive performance while utilizing approximately 14.75 times\nless memory than state-of-the-art methods. Furthermore, we introduce a new\nmetric to better assess CL's plasticity-stability trade-off.\n","authors":["Mohammad Ali Vahedifar","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04638v1.pdf","comment":"This paper is submitted to ICCV 2025"},{"id":"http://arxiv.org/abs/2202.00665v4","updated":"2025-03-06T17:24:46Z","published":"2022-02-01T18:58:33Z","title":"Tutorial on amortized optimization","summary":" Optimization is a ubiquitous modeling tool and is often deployed in settings\nwhich repeatedly solve similar instances of the same problem. 
Amortized\noptimization methods use learning to predict the solutions to problems in these\nsettings, exploiting the shared structure between similar problem instances.\nThese methods have been crucial in variational inference and reinforcement\nlearning and are capable of solving optimization problems many orders of\nmagnitudes times faster than traditional optimization methods that do not use\namortization. This tutorial presents an introduction to the amortized\noptimization foundations behind these advancements and overviews their\napplications in variational inference, sparse coding, gradient-based\nmeta-learning, control, reinforcement learning, convex optimization, optimal\ntransport, and deep equilibrium networks. The source code for this tutorial is\navailable at\nhttps://github.com/facebookresearch/amortized-optimization-tutorial.\n","authors":["Brandon Amos"],"pdf_url":"https://arxiv.org/pdf/2202.00665v4.pdf","comment":"Foundations and Trends in Machine Learning"},{"id":"http://arxiv.org/abs/2503.04636v1","updated":"2025-03-06T17:24:06Z","published":"2025-03-06T17:24:06Z","title":"Mark Your LLM: Detecting the Misuse of Open-Source Large Language Models\n via Watermarking","summary":" As open-source large language models (LLMs) like Llama3 become more capable,\nit is crucial to develop watermarking techniques to detect their potential\nmisuse. Existing watermarking methods either add watermarks during LLM\ninference, which is unsuitable for open-source LLMs, or primarily target\nclassification LLMs rather than recent generative LLMs. Adapting these\nwatermarks to open-source LLMs for misuse detection remains an open challenge.\nThis work defines two misuse scenarios for open-source LLMs: intellectual\nproperty (IP) violation and LLM Usage Violation. Then, we explore the\napplication of inference-time watermark distillation and backdoor watermarking\nin these contexts. 
We propose comprehensive evaluation methods to assess the\nimpact of various real-world further fine-tuning scenarios on watermarks and\nthe effect of these watermarks on LLM performance. Our experiments reveal that\nbackdoor watermarking could effectively detect IP Violation, while\ninference-time watermark distillation is applicable in both scenarios but less\nrobust to further fine-tuning and has a more significant impact on LLM\nperformance compared to backdoor watermarking. Exploring more advanced\nwatermarking methods for open-source LLMs to detect their misuse should be an\nimportant future direction.\n","authors":["Yijie Xu","Aiwei Liu","Xuming Hu","Lijie Wen","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2503.04636v1.pdf","comment":"Accepted by the 1st Workshop on GenAI Watermarking, collocated with\n ICLR 2025"},{"id":"http://arxiv.org/abs/2503.00897v3","updated":"2025-03-06T17:19:22Z","published":"2025-03-02T13:43:53Z","title":"A Simple and Effective Reinforcement Learning Method for Text-to-Image\n Diffusion Fine-tuning","summary":" Reinforcement learning (RL)-based fine-tuning has emerged as a powerful\napproach for aligning diffusion models with black-box objectives. Proximal\npolicy optimization (PPO) is the most popular choice of method for policy\noptimization. While effective in terms of performance, PPO is highly sensitive\nto hyper-parameters and involves substantial computational overhead. REINFORCE,\non the other hand, mitigates some computational complexities such as high\nmemory overhead and sensitive hyper-parameter tuning, but has suboptimal\nperformance due to high-variance and sample inefficiency. While the variance of\nthe REINFORCE can be reduced by sampling multiple actions per input prompt and\nusing a baseline correction term, it still suffers from sample inefficiency. 
To\naddress these challenges, we systematically analyze the\nefficiency-effectiveness trade-off between REINFORCE and PPO, and propose\nleave-one-out PPO (LOOP), a novel RL for diffusion fine-tuning method. LOOP\ncombines variance reduction techniques from REINFORCE, such as sampling\nmultiple actions per input prompt and a baseline correction term, with the\nrobustness and sample efficiency of PPO via clipping and importance sampling.\nOur results demonstrate that LOOP effectively improves diffusion models on\nvarious black-box objectives, and achieves a better balance between\ncomputational efficiency and performance.\n","authors":["Shashank Gupta","Chaitanya Ahuja","Tsung-Yu Lin","Sreya Dutta Roy","Harrie Oosterhuis","Maarten de Rijke","Satya Narayan Shukla"],"pdf_url":"https://arxiv.org/pdf/2503.00897v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04626v1","updated":"2025-03-06T17:12:46Z","published":"2025-03-06T17:12:46Z","title":"IDInit: A Universal and Stable Initialization Method for Neural Network\n Training","summary":" Deep neural networks have achieved remarkable accomplishments in practice.\nThe success of these networks hinges on effective initialization methods, which\nare vital for ensuring stable and rapid convergence during training. Recently,\ninitialization methods that maintain identity transition within layers have\nshown good efficiency in network training. These techniques (e.g., Fixup) set\nspecific weights to zero to achieve identity control. However, settings of\nremaining weight (e.g., Fixup uses random values to initialize non-zero\nweights) will affect the inductive bias that is achieved only by a zero weight,\nwhich may be harmful to training. Addressing this concern, we introduce fully\nidentical initialization (IDInit), a novel method that preserves identity in\nboth the main and sub-stem layers of residual networks. IDInit employs a padded\nidentity-like matrix to overcome rank constraints in non-square weight\nmatrices. 
Furthermore, we show the convergence problem of an identity matrix\ncan be solved by stochastic gradient descent. Additionally, we enhance the\nuniversality of IDInit by processing higher-order weights and addressing dead\nneuron problems. IDInit is a straightforward yet effective initialization\nmethod, with improved convergence, stability, and performance across various\nsettings, including large-scale datasets and deep models.\n","authors":["Yu Pan","Chaozheng Wang","Zekai Wu","Qifan Wang","Min Zhang","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2503.04626v1.pdf","comment":"Accepted in ICLR 2025"},{"id":"http://arxiv.org/abs/2410.05116v2","updated":"2025-03-06T17:11:55Z","published":"2024-10-07T15:12:01Z","title":"Human-Feedback Efficient Reinforcement Learning for Online Diffusion\n Model Finetuning","summary":" Controllable generation through Stable Diffusion (SD) fine-tuning aims to\nimprove fidelity, safety, and alignment with human guidance. Existing\nreinforcement learning from human feedback methods usually rely on predefined\nheuristic reward functions or pretrained reward models built on large-scale\ndatasets, limiting their applicability to scenarios where collecting such data\nis costly or difficult. To effectively and efficiently utilize human feedback,\nwe develop a framework, HERO, which leverages online human feedback collected\non the fly during model learning. Specifically, HERO features two key\nmechanisms: (1) Feedback-Aligned Representation Learning, an online training\nmethod that captures human feedback and provides informative learning signals\nfor fine-tuning, and (2) Feedback-Guided Image Generation, which involves\ngenerating images from SD's refined initialization samples, enabling faster\nconvergence towards the evaluator's intent. We demonstrate that HERO is 4x more\nefficient in online feedback for body part anomaly correction compared to the\nbest existing method. 
Additionally, experiments show that HERO can effectively\nhandle tasks like reasoning, counting, personalization, and reducing NSFW\ncontent with only 0.5K online feedback.\n","authors":["Ayano Hiranaka","Shang-Fu Chen","Chieh-Hsin Lai","Dongjun Kim","Naoki Murata","Takashi Shibuya","Wei-Hsiang Liao","Shao-Hua Sun","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2410.05116v2.pdf","comment":"Published in International Conference on Learning Representations\n (ICLR) 2025"},{"id":"http://arxiv.org/abs/2412.16561v2","updated":"2025-03-06T17:04:11Z","published":"2024-12-21T10:07:40Z","title":"A learning-based approach to stochastic optimal control under\n reach-avoid constraint","summary":" We develop a model-free approach to optimally control stochastic, Markovian\nsystems subject to a reach-avoid constraint. Specifically, the state trajectory\nmust remain within a safe set while reaching a target set within a finite time\nhorizon. Due to the time-dependent nature of these constraints, we show that,\nin general, the optimal policy for this constrained stochastic control problem\nis non-Markovian, which increases the computational complexity. To address this\nchallenge, we apply the state-augmentation technique from arXiv:2402.19360,\nreformulating the problem as a constrained Markov decision process (CMDP) on an\nextended state space. This transformation allows us to search for a Markovian\npolicy, avoiding the complexity of non-Markovian policies. To learn the optimal\npolicy without a system model, and using only trajectory data, we develop a\nlog-barrier policy gradient approach. 
We prove that under suitable assumptions,\nthe policy parameters converge to the optimal parameters, while ensuring that\nthe system trajectories satisfy the stochastic reach-avoid constraint with high\nprobability.\n","authors":["Tingting Ni","Maryam Kamgarpour"],"pdf_url":"https://arxiv.org/pdf/2412.16561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04606v1","updated":"2025-03-06T16:53:14Z","published":"2025-03-06T16:53:14Z","title":"The Best of Both Worlds: Integrating Language Models and Diffusion\n Models for Video Generation","summary":" Recent advancements in text-to-video (T2V) generation have been driven by two\ncompeting paradigms: autoregressive language models and diffusion models.\nHowever, each paradigm has intrinsic limitations: language models struggle with\nvisual quality and error accumulation, while diffusion models lack semantic\nunderstanding and causal modeling. In this work, we propose LanDiff, a hybrid\nframework that synergizes the strengths of both paradigms through\ncoarse-to-fine generation. Our architecture introduces three key innovations:\n(1) a semantic tokenizer that compresses 3D visual features into compact 1D\ndiscrete representations through efficient semantic compression, achieving a\n$\\sim$14,000$\\times$ compression ratio; (2) a language model that generates\nsemantic tokens with high-level semantic relationships; (3) a streaming\ndiffusion model that refines coarse semantics into high-fidelity videos.\nExperiments show that LanDiff, a 5B model, achieves a score of 85.43 on the\nVBench T2V benchmark, surpassing the state-of-the-art open-source models\nHunyuan Video (13B) and other commercial models such as Sora, Keling, and\nHailuo. Furthermore, our model also achieves state-of-the-art performance in\nlong video generation, surpassing other open-source models in this field. 
Our\ndemo can be viewed at https://landiff.github.io/.\n","authors":["Aoxiong Yin","Kai Shen","Yichong Leng","Xu Tan","Xinyu Zhou","Juncheng Li","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2503.04606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03770v1","updated":"2025-03-06T16:42:53Z","published":"2025-03-06T16:42:53Z","title":"Fusion of Various Optimization Based Feature Smoothing Methods for\n Wearable and Non-invasive Blood Glucose Estimation","summary":" Recently, the wearable and non-invasive blood glucose estimation approach has\nbeen proposed. However, due to the unreliability of the acquisition device, the\npresence of the noise and the variations of the acquisition environments, the\nobtained features and the reference blood glucose values are highly unreliable.\nTo address this issue, this paper proposes a polynomial fitting approach to\nsmooth the obtained features or the reference blood glucose values. First, the\nblood glucose values are estimated based on the individual optimization\napproaches. Second, the absolute difference values between the estimated blood\nglucose values and the actual blood glucose values based on each optimization\napproach are computed. Third, these absolute difference values for each\noptimization approach are sorted in the ascending order. Fourth, for each\nsorted blood glucose value, the optimization method corresponding to the\nminimum absolute difference value is selected. Fifth, the accumulate\nprobability of each selected optimization method is computed. If the accumulate\nprobability of any selected optimization method at a point is greater than a\nthreshold value, then the accumulate probabilities of these three selected\noptimization methods at that point are reset to zero. A range of the sorted\nblood glucose values are defined as that with the corresponding boundaries\npoints being the previous reset point and this reset point. 
Hence, after\nperforming the above procedures for all the sorted reference blood glucose\nvalues in the validation set, the regions of the sorted reference blood glucose\nvalues and the corresponding optimization methods in these regions are\ndetermined. The computer numerical simulation results show that our proposed\nmethod yields the mean absolute relative deviation (MARD) at 0.0930 and the\npercentage of the test data falling in the zone A of the Clarke error grid at\n94.1176%.\n","authors":["Yiting Wei","Bingo Wing-Kuen Ling","Danni Chen","Yuheng Dai","Qing Liu"],"pdf_url":"https://arxiv.org/pdf/2503.03770v1.pdf","comment":"This version corrects several typos"},{"id":"http://arxiv.org/abs/2503.04598v1","updated":"2025-03-06T16:40:48Z","published":"2025-03-06T16:40:48Z","title":"HybridNorm: Towards Stable and Efficient Transformer Training via Hybrid\n Normalization","summary":" Transformers have become the de facto architecture for a wide range of\nmachine learning tasks, particularly in large language models (LLMs). Despite\ntheir remarkable performance, challenges remain in training deep transformer\nnetworks, especially regarding the location of layer normalization. While\nPre-Norm structures facilitate easier training due to their more prominent\nidentity path, they often yield suboptimal performance compared to Post-Norm.\nIn this paper, we propose $\\textbf{HybridNorm}$, a straightforward yet\neffective hybrid normalization strategy that integrates the advantages of both\nPre-Norm and Post-Norm approaches. Specifically, HybridNorm employs QKV\nnormalization within the attention mechanism and Post-Norm in the feed-forward\nnetwork (FFN) of each transformer block. 
This design not only stabilizes\ntraining but also enhances performance, particularly in the context of LLMs.\nComprehensive experiments in both dense and sparse architectures show that\nHybridNorm consistently outperforms both Pre-Norm and Post-Norm approaches,\nachieving state-of-the-art results across various benchmarks. These findings\nhighlight the potential of HybridNorm as a more stable and effective technique\nfor improving the training and performance of deep transformer models. %Code\nwill be made publicly available. Code is available at\nhttps://github.com/BryceZhuo/HybridNorm.\n","authors":["Zhijian Zhuo","Yutao Zeng","Ya Wang","Sijun Zhang","Jian Yang","Xiaoqing Li","Xun Zhou","Jinwen Ma"],"pdf_url":"https://arxiv.org/pdf/2503.04598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17412v3","updated":"2025-03-06T16:22:22Z","published":"2024-05-27T17:57:12Z","title":"Towards One Model for Classical Dimensionality Reduction: A\n Probabilistic Perspective on UMAP and t-SNE","summary":" This paper shows that dimensionality reduction methods such as UMAP and\nt-SNE, can be approximately recast as MAP inference methods corresponding to a\nmodel introduced in ProbDR, that describes the graph Laplacian (an estimate of\nthe data precision matrix) using a Wishart distribution, with a mean given by a\nnon-linear covariance function evaluated on the latents. This interpretation\noffers deeper theoretical and semantic insights into such algorithms, by\nshowing that variances corresponding to these covariances are low (potentially\nmisspecified), and forging a connection to Gaussian process latent variable\nmodels by showing that well-known kernels can be used to describe covariances\nimplied by graph Laplacians. We also introduce tools with which similar\ndimensionality reduction methods can be studied.\n","authors":["Aditya Ravuri","Neil D. 
Lawrence"],"pdf_url":"https://arxiv.org/pdf/2405.17412v3.pdf","comment":"Updated preprint"},{"id":"http://arxiv.org/abs/2503.04585v1","updated":"2025-03-06T16:22:19Z","published":"2025-03-06T16:22:19Z","title":"Advancing Solutions for the Three-Body Problem Through Physics-Informed\n Neural Networks","summary":" First formulated by Sir Isaac Newton in his work \"Philosophiae Naturalis\nPrincipia Mathematica\", the concept of the Three-Body Problem was put forth as\na study of the motion of the three celestial bodies within the Earth-Sun-Moon\nsystem. In a generalized definition, it seeks to predict the motion for an\nisolated system composed of three point masses freely interacting under\nNewton's law of universal attraction. This proves to be analogous to a\nmultitude of interactions between celestial bodies, and thus, the problem finds\napplicability within the studies of celestial mechanics. Despite numerous\nattempts by renowned physicists to solve it throughout the last three\ncenturies, no general closed-form solutions have been reached due to its\ninherently chaotic nature for most initial conditions. Current state-of-the-art\nsolutions are based on two approaches, either numerical high-precision\nintegration or machine learning-based. Notwithstanding the breakthroughs of\nneural networks, these present a significant limitation, which is their\nignorance of any prior knowledge of the chaotic systems presented. Thus, in\nthis work, we propose a novel method that utilizes Physics-Informed Neural\nNetworks (PINNs). These deep neural networks are able to incorporate any prior\nsystem knowledge expressible as an Ordinary Differential Equation (ODE) into\ntheir learning processes as a regularizing agent. Our findings showcase that\nPINNs surpass current state-of-the-art machine learning methods with comparable\nprediction quality. 
Despite a better prediction quality, the usability of\nnumerical integrators suffers due to their prohibitively high computational\ncost. These findings confirm that PINNs are both effective and time-efficient\nopen-form solvers of the Three-Body Problem that capitalize on the extensive\nknowledge we hold of classical mechanics.\n","authors":["Manuel Santos Pereira","Luís Tripa","Nélson Lima","Francisco Caldas","Cláudia Soares"],"pdf_url":"https://arxiv.org/pdf/2503.04585v1.pdf","comment":"14 pages, 25 figures, 3 tables. 75th International Astronautical\n Congress (IAC), Milan, Italy, 14-18 October"},{"id":"http://arxiv.org/abs/2503.04582v1","updated":"2025-03-06T16:20:25Z","published":"2025-03-06T16:20:25Z","title":"PSDNorm: Test-Time Temporal Normalization for Deep Learning on EEG\n Signals","summary":" Distribution shift poses a significant challenge in machine learning,\nparticularly in biomedical applications such as EEG signals collected across\ndifferent subjects, institutions, and recording devices. While existing\nnormalization layers, Batch-Norm, LayerNorm and InstanceNorm, help address\ndistribution shifts, they fail to capture the temporal dependencies inherent in\ntemporal signals. In this paper, we propose PSDNorm, a layer that leverages\nMonge mapping and temporal context to normalize feature maps in deep learning\nmodels. 
Notably, the proposed method operates as a test-time domain adaptation\ntechnique, addressing distribution shifts without additional training.\nEvaluations on 10 sleep staging datasets using the U-Time model demonstrate\nthat PSDNorm achieves state-of-the-art performance at test time on datasets not\nseen during training while being 4x more data-efficient than the best baseline.\nAdditionally, PSDNorm provides a significant improvement in robustness,\nachieving markedly higher F1 scores for the 20% hardest subjects.\n","authors":["Théo Gnassounou","Antoine Collas","Rémi Flamary","Alexandre Gramfort"],"pdf_url":"https://arxiv.org/pdf/2503.04582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.17504v2","updated":"2025-03-06T16:14:45Z","published":"2025-02-21T19:22:10Z","title":"Protein Large Language Models: A Comprehensive Survey","summary":" Protein-specific large language models (Protein LLMs) are revolutionizing\nprotein science by enabling more efficient protein structure prediction,\nfunction annotation, and design. While existing surveys focus on specific\naspects or applications, this work provides the first comprehensive overview of\nProtein LLMs, covering their architectures, training datasets, evaluation\nmetrics, and diverse applications. Through a systematic analysis of over 100\narticles, we propose a structured taxonomy of state-of-the-art Protein LLMs,\nanalyze how they leverage large-scale protein sequence data for improved\naccuracy, and explore their potential in advancing protein engineering and\nbiomedical research. Additionally, we discuss key challenges and future\ndirections, positioning Protein LLMs as essential tools for scientific\ndiscovery in protein science. 
Resources are maintained at\nhttps://github.com/Yijia-Xiao/Protein-LLM-Survey.\n","authors":["Yijia Xiao","Wanjia Zhao","Junkai Zhang","Yiqiao Jin","Han Zhang","Zhicheng Ren","Renliang Sun","Haixin Wang","Guancheng Wan","Pan Lu","Xiao Luo","Yu Zhang","James Zou","Yizhou Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2502.17504v2.pdf","comment":"24 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2412.07093v2","updated":"2025-03-06T16:14:01Z","published":"2024-12-10T01:21:56Z","title":"Streaming Private Continual Counting via Binning","summary":" In differential privacy, $\\textit{continual observation}$ refers to problems\nin which we wish to continuously release a function of a dataset that is\nrevealed one element at a time. The challenge is to maintain a good\napproximation while keeping the combined output over all time steps\ndifferentially private. In the special case of $\\textit{continual counting}$ we\nseek to approximate a sum of binary input elements. This problem has received\nconsiderable attention lately, in part due to its relevance in implementations\nof differentially private stochastic gradient descent. $\\textit{Factorization\nmechanisms}$ are the leading approach to continual counting, but the best such\nmechanisms do not work well in $\\textit{streaming}$ settings since they require\nspace proportional to the size of the input. In this paper, we present a simple\napproach to approximating factorization mechanisms in low space via\n$\\textit{binning}$, where adjacent matrix entries with similar values are\nchanged to be identical in such a way that a matrix-vector product can be\nmaintained in sublinear space. Our approach has provable sublinear space\nguarantees for a class of lower triangular matrices whose entries are\nmonotonically decreasing away from the diagonal. 
We show empirically that even\nwith very low space usage we are able to closely match, and sometimes surpass,\nthe performance of asymptotically optimal factorization mechanisms. Recently,\nand independently of our work, Dvijotham et al. have also suggested an approach\nto implementing factorization mechanisms in a streaming setting. Their work\ndiffers from ours in several respects: It only addresses factorization into\n$\\textit{Toeplitz}$ matrices, only considers $\\textit{maximum}$ error, and uses\na different technique based on rational function approximation that seems less\nversatile than our binning approach.\n","authors":["Joel Daniel Andersson","Rasmus Pagh"],"pdf_url":"https://arxiv.org/pdf/2412.07093v2.pdf","comment":"Accepted to SaTML 2025. Final version to appear on IEEE eXplore"},{"id":"http://arxiv.org/abs/2503.04579v1","updated":"2025-03-06T16:13:32Z","published":"2025-03-06T16:13:32Z","title":"Data-augmented Learning of Geodesic Distances in Irregular Domains\n through Soner Boundary Conditions","summary":" Geodesic distances play a fundamental role in robotics, as they efficiently\nencode global geometric information of the domain. Recent methods use neural\nnetworks to approximate geodesic distances by solving the Eikonal equation\nthrough physics-informed approaches. While effective, these approaches often\nsuffer from unstable convergence during training in complex environments. We\npropose a framework to learn geodesic distances in irregular domains by using\nthe Soner boundary condition, and systematically evaluate the impact of data\nlosses on training stability and solution accuracy. Our experiments demonstrate\nthat incorporating data losses significantly improves convergence robustness,\nreducing training instabilities and sensitivity to initialization. These\nfindings suggest that hybrid data-physics approaches can effectively enhance\nthe reliability of learning-based geodesic distance solvers with sparse data.\n","authors":["Rafael I. 
Cabral Muchacho","Florian T. Pokorny"],"pdf_url":"https://arxiv.org/pdf/2503.04579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01804v2","updated":"2025-03-06T16:07:43Z","published":"2025-03-03T18:33:46Z","title":"$\\texttt{SEM-CTRL}$: Semantically Controlled Decoding","summary":" Ensuring both syntactic and semantic correctness in Large Language Model\n(LLM) outputs remains a significant challenge, despite being critical for\nreal-world deployment. In this paper, we introduce $\\texttt{SEM-CTRL}$, a\nunified approach that enforces rich context-sensitive constraints and task- and\ninstance-specific semantics directly on an LLM decoder. Our approach integrates\ntoken-level MCTS, which is guided by specific syntactic and semantic\nconstraints. The constraints over the desired outputs are expressed using\nAnswer Set Grammars -- a logic-based formalism that generalizes\ncontext-sensitive grammars while incorporating background knowledge to\nrepresent task-specific semantics. We show that our approach guarantees correct\ncompletions for any off-the-shelf LLM without the need for fine-tuning. We\nevaluate $\\texttt{SEM-CTRL}$ on a range of tasks, including synthetic grammar\nsynthesis, combinatorial reasoning, and planning. 
Our results demonstrate that\n$\\texttt{SEM-CTRL}$ allows small pre-trained LLMs to efficiently outperform\nlarger variants and state-of-the-art reasoning models (e.g., o1-preview) while\nsimultaneously guaranteeing solution correctness.\n","authors":["Mohammad Albinhassan","Pranava Madhyastha","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2503.01804v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04570v1","updated":"2025-03-06T16:05:29Z","published":"2025-03-06T16:05:29Z","title":"Meta Learning not to Learn: Robustly Informing Meta-Learning under\n Nuisance-Varying Families","summary":" In settings where both spurious and causal predictors are available, standard\nneural networks trained under the objective of empirical risk minimization\n(ERM) with no additional inductive biases tend to have a dependence on a\nspurious feature. As a result, it is necessary to integrate additional\ninductive biases in order to guide the network toward generalizable hypotheses.\nOften these spurious features are shared across related tasks, such as\nestimating disease prognoses from image scans coming from different hospitals,\nmaking the challenge of generalization more difficult. In these settings, it is\nimportant that methods are able to integrate the proper inductive biases to\ngeneralize across both nuisance-varying families as well as task families.\nMotivated by this setting, we present RIME (Robustly Informed Meta lEarning), a\nnew method for meta learning under the presence of both positive and negative\ninductive biases (what to learn and what not to learn). We first develop a\ntheoretical causal framework showing why existing approaches at knowledge\nintegration can lead to worse performance on distributionally robust\nobjectives. 
We then show that RIME is able to simultaneously integrate both\nbiases, reaching state of the art performance under distributionally robust\nobjectives in informed meta-learning settings under nuisance-varying families.\n","authors":["Louis McConnell"],"pdf_url":"https://arxiv.org/pdf/2503.04570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.18049v2","updated":"2025-03-06T16:03:59Z","published":"2025-02-25T10:15:16Z","title":"Golden Ratio Weighting Prevents Model Collapse","summary":" Recent studies identified an intriguing phenomenon in recursive generative\nmodel training known as model collapse, where models trained on data generated\nby previous models exhibit severe performance degradation. Addressing this\nissue and developing more effective training strategies have become central\nchallenges in generative model research. In this paper, we investigate this\nphenomenon theoretically within a novel framework, where generative models are\niteratively trained on a combination of newly collected real data and synthetic\ndata from the previous training step. To develop an optimal training strategy\nfor integrating real and synthetic data, we evaluate the performance of a\nweighted training scheme in various scenarios, including Gaussian distribution\nestimation and linear regression. We theoretically characterize the impact of\nthe mixing proportion and weighting scheme of synthetic data on the final\nmodel's performance. Our key finding is that, across different settings, the\noptimal weighting scheme under different proportions of synthetic data\nasymptotically follows a unified expression, revealing a fundamental trade-off\nbetween leveraging synthetic data and generative model performance. Notably, in\nsome cases, the optimal weight assigned to real data corresponds to the\nreciprocal of the golden ratio. 
Finally, we validate our theoretical results on\nextensive simulated datasets and a real tabular dataset.\n","authors":["Hengzhi He","Shirong Xu","Guang Cheng"],"pdf_url":"https://arxiv.org/pdf/2502.18049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00153v2","updated":"2025-03-06T15:50:28Z","published":"2024-09-30T18:52:53Z","title":"Beyond Single Concept Vector: Modeling Concept Subspace in LLMs with\n Gaussian Distribution","summary":" Probing learned concepts in large language models (LLMs) is crucial for\nunderstanding how semantic knowledge is encoded internally. Training linear\nclassifiers on probing tasks is a principle approach to denote the vector of a\ncertain concept in the representation space. However, the single vector\nidentified for a concept varies with both data and training, making it less\nrobust and weakening its effectiveness in real-world applications. To address\nthis challenge, we propose an approach to approximate the subspace representing\na specific concept. Built on linear probing classifiers, we extend the concept\nvectors into Gaussian Concept Subspace (GCS). We demonstrate GCS's\neffectiveness through measuring its faithfulness and plausibility across\nmultiple LLMs with different sizes and architectures. Additionally, we use\nrepresentation intervention tasks to showcase its efficacy in real-world\napplications such as emotion steering. 
Experimental results indicate that GCS\nconcept vectors have the potential to balance steering performance and\nmaintaining the fluency in natural language generation tasks.\n","authors":["Haiyan Zhao","Heng Zhao","Bo Shen","Ali Payani","Fan Yang","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2410.00153v2.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2501.02436v2","updated":"2025-03-06T15:49:50Z","published":"2025-01-05T04:23:21Z","title":"An Analysis Framework for Understanding Deep Neural Networks Based on\n Network Dynamics","summary":" Advancing artificial intelligence demands a deeper understanding of the\nmechanisms underlying deep learning. Here, we propose a straightforward\nanalysis framework based on the dynamics of learning models. Neurons are\ncategorized into two modes based on whether their transformation functions\npreserve order. This categorization reveals how deep neural networks (DNNs)\nmaximize information extraction by rationally allocating the proportion of\nneurons in different modes across deep layers. We further introduce the\nattraction basins of the training samples in both the sample vector space and\nthe weight vector space to characterize the generalization ability of DNNs.\nThis framework allows us to identify optimal depth and width configurations,\nproviding a unified explanation for fundamental DNN behaviors such as the \"flat\nminima effect,\" \"grokking,\" and double descent phenomena. Our analysis extends\nto networks with depths up to 100 layers.\n","authors":["Yuchen Lin","Yong Zhang","Sihan Feng","Hong Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.02436v2.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2503.04556v1","updated":"2025-03-06T15:47:19Z","published":"2025-03-06T15:47:19Z","title":"Compositional Causal Reasoning Evaluation in Language Models","summary":" Causal reasoning and compositional reasoning are two core aspirations in\ngenerative AI. 
Measuring the extent of these behaviors requires principled\nevaluation methods. We explore a unified perspective that considers both\nbehaviors simultaneously, termed compositional causal reasoning (CCR): the\nability to infer how causal measures compose and, equivalently, how causal\nquantities propagate through graphs. We instantiate a framework for the\nsystematic evaluation of CCR for the average treatment effect and the\nprobability of necessity and sufficiency. As proof of concept, we demonstrate\nthe design of CCR tasks for language models in the LLama, Phi, and GPT\nfamilies. On a math word problem, our framework revealed a range of\ntaxonomically distinct error patterns. Additionally, CCR errors increased with\nthe complexity of causal paths for all models except o1.\n","authors":["Jacqueline R. M. A. Maasch","Alihan Hüyük","Xinnuo Xu","Aditya V. Nori","Javier Gonzalez"],"pdf_url":"https://arxiv.org/pdf/2503.04556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18691v2","updated":"2025-03-06T15:47:01Z","published":"2024-07-26T12:16:53Z","title":"Graph Neural Networks for Virtual Sensing in Complex Systems: Addressing\n Heterogeneous Temporal Dynamics","summary":" Real-time condition monitoring is crucial for the reliable and efficient\noperation of complex systems. However, relying solely on physical sensors can\nbe limited due to their cost, placement constraints, or inability to directly\nmeasure certain critical parameters. Virtual sensing addresses these\nlimitations by leveraging readily available sensor data and system knowledge to\nestimate inaccessible parameters or infer system states. The increasing\ncomplexity of industrial systems necessitates deployments of sensors with\ndiverse modalities to provide a comprehensive understanding of system states.\nThese sensors capture data at varying frequencies to monitor both rapid and\nslowly varying system dynamics, as well as local and global state evolutions of\nthe systems. 
This leads to heterogeneous temporal dynamics, which, particularly\nunder varying operational end environmental conditions, pose a significant\nchallenge for accurate virtual sensing. To address this, we propose a\nHeterogeneous Temporal Graph Neural Network (HTGNN) framework. HTGNN explicitly\nmodels signals from diverse sensors and integrates operating conditions into\nthe model architecture. We evaluate HTGNN using two newly released datasets: a\nbearing dataset with diverse load conditions for bearing load prediction and a\nyear-long simulated dataset for predicting bridge live loads. Our results\ndemonstrate that HTGNN significantly outperforms established baseline methods\nin both tasks, particularly under highly varying operating conditions. These\nresults highlight HTGNN's potential as a robust and accurate virtual sensing\napproach for complex systems, paving the way for improved monitoring,\npredictive maintenance, and enhanced system performance. Our code and data are\navailable under https://github.com/EPFL-IMOS/htgnn.\n","authors":["Mengjie Zhao","Cees Taal","Stephan Baggerohr","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2407.18691v2.pdf","comment":"This paper extends our previous conference paper (Best Paper at\n European Conference of the PHM Society 2024,\n https://doi.org/10.36001/phme.2024.v8i1.3998). Accepted by Mechanical Systems\n and Signal Processing (MSSP)"},{"id":"http://arxiv.org/abs/2502.18394v4","updated":"2025-03-06T15:39:55Z","published":"2025-02-25T17:43:43Z","title":"The FFT Strikes Back: An Efficient Alternative to Self-Attention","summary":" Conventional self-attention mechanisms incur quadratic complexity, limiting\ntheir scalability on long sequences. We introduce \\textbf{FFTNet}, an adaptive\nspectral filtering framework that leverages the Fast Fourier Transform (FFT) to\nachieve global token mixing in $\\mathcal{O}(n\\log n)$ time. 
By transforming\ninputs into the frequency domain, FFTNet exploits the orthogonality and energy\npreservation guaranteed by Parseval's theorem to capture long-range\ndependencies efficiently. Our main theoretical contributions are 1) an adaptive\nspectral filter, 2) combining local windowing with a global FFT branch, and 3)\nrich nonlinearity introduction in both the frequency and token domains.\nExperiments on the Long Range Arena and ImageNet benchmarks validate our\ntheoretical insights and demonstrate superior performance over fixed Fourier\nand standard attention models.\n","authors":["Jacob Fein-Ashley"],"pdf_url":"https://arxiv.org/pdf/2502.18394v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.09990v2","updated":"2025-03-06T15:38:31Z","published":"2025-02-14T08:22:51Z","title":"X-Boundary: Establishing Exact Safety Boundary to Shield LLMs from\n Multi-Turn Jailbreaks without Compromising Usability","summary":" Despite the rapid development of safety alignment techniques for LLMs,\ndefending against multi-turn jailbreaks is still a challenging task. In this\npaper, we conduct a comprehensive comparison, revealing that some existing\ndefense methods can improve the robustness of LLMs against multi-turn\njailbreaks but compromise usability, i.e., reducing general capabilities or\ncausing the over-refusal problem. From the perspective of mechanism\ninterpretability of LLMs, we discover that these methods fail to establish a\nboundary that exactly distinguishes safe and harmful feature representations.\nTherefore, boundary-safe representations close to harmful representations are\ninevitably disrupted, leading to a decline in usability. To address this issue,\nwe propose X-Boundary to push harmful representations away from boundary-safe\nrepresentations and obtain an exact distinction boundary. 
In this way, harmful\nrepresentations can be precisely erased without disrupting safe ones.\nExperimental results show that X-Boundary achieves state-of-the-art defense\nperformance against multi-turn jailbreaks, while reducing the over-refusal rate\nby about 20% and maintaining nearly complete general capability. Furthermore,\nwe theoretically prove and empirically verify that X-Boundary can accelerate\nthe convergence process during training. Please see our code at:\nhttps://github.com/AI45Lab/X-Boundary.\n","authors":["Xiaoya Lu","Dongrui Liu","Yi Yu","Luxin Xu","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2502.09990v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.03660v2","updated":"2025-03-06T15:32:00Z","published":"2025-03-05T16:47:36Z","title":"Chunking the Critic: A Transformer-based Soft Actor-Critic with N-Step\n Returns","summary":" Soft Actor-Critic (SAC) critically depends on its critic network, which\ntypically evaluates a single state-action pair to guide policy updates. Using\nN-step returns is a common practice to reduce the bias in the target values of\nthe critic. However, using N-step returns can again introduce high variance and\nnecessitates importance sampling, often destabilizing training. Recent\nalgorithms have also explored action chunking-such as direct action repetition\nand movement primitives-to enhance exploration. In this paper, we propose a\nTransformer-based Critic Network for SAC that integrates the N-returns\nframework in a stable and efficient manner. Unlike approaches that perform\nchunking in the actor network, we feed chunked actions into the critic network\nto explore potential performance gains. Our architecture leverages the\nTransformer's ability to process sequential information, facilitating more\nrobust value estimation. 
Empirical results show that this method not only\nachieves efficient, stable training but also excels in sparse\nreward/multi-phase environments-traditionally a challenge for step-based\nmethods. These findings underscore the promise of combining Transformer-based\ncritics with N-returns to advance reinforcement learning performance\n","authors":["Dong Tian","Ge Li","Hongyi Zhou","Onur Celik","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2503.03660v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07516v3","updated":"2025-03-06T15:30:10Z","published":"2023-12-12T18:47:12Z","title":"Learning finitely correlated states: stability of the spectral\n reconstruction","summary":" Matrix product operators allow efficient descriptions (or realizations) of\nstates on a 1D lattice. We consider the task of learning a realization of\nminimal dimension from copies of an unknown state, such that the resulting\noperator is close to the density matrix in trace norm. For finitely correlated\ntranslation-invariant states on an infinite chain, a realization of minimal\ndimension can be exactly reconstructed via linear algebra operations from the\nmarginals of a size depending on the representation dimension. We establish a\nbound on the trace norm error for an algorithm that estimates a candidate\nrealization from estimates of these marginals and outputs a matrix product\noperator, estimating the state of a chain of arbitrary length $t$. This bound\nallows us to establish an $O(t^2)$ upper bound on the sample complexity of the\nlearning task, with an explicit dependence on the site dimension, realization\ndimension and spectral properties of a certain map constructed from the state.\nA refined error bound can be proven for $C^*$-finitely correlated states, which\nhave an operational interpretation in terms of sequential quantum channels\napplied to the memory system. 
We can also obtain an analogous error bound for a\nclass of matrix product density operators on a finite chain reconstructible by\nlocal marginals. In this case, a linear number of marginals must be estimated,\nobtaining a sample complexity of $\\tilde{O}(t^3)$. The learning algorithm also\nworks for states that are sufficiently close to a finitely correlated state,\nwith the potential of providing competitive algorithms for other interesting\nfamilies of states.\n","authors":["Marco Fanizza","Niklas Galke","Josep Lumbreras","Cambyse Rouzé","Andreas Winter"],"pdf_url":"https://arxiv.org/pdf/2312.07516v3.pdf","comment":"42 pages, 7 figures. Manuscript restructured, with minor corrections\n and clarifications"},{"id":"http://arxiv.org/abs/2403.00025v2","updated":"2025-03-06T15:29:41Z","published":"2024-02-28T15:19:33Z","title":"On the Challenges and Opportunities in Generative AI","summary":" The field of deep generative modeling has grown rapidly in the last few\nyears. With the availability of massive amounts of training data coupled with\nadvances in scalable unsupervised learning paradigms, recent large-scale\ngenerative models show tremendous promise in synthesizing high-resolution\nimages and text, as well as structured data such as videos and molecules.\nHowever, we argue that current large-scale generative AI models exhibit several\nfundamental shortcomings that hinder their widespread adoption across domains.\nIn this work, our objective is to identify these issues and highlight key\nunresolved challenges in modern generative AI paradigms that should be\naddressed to further enhance their capabilities, versatility, and reliability.\nBy identifying these challenges, we aim to provide researchers with insights\nfor exploring fruitful research directions, thus fostering the development of\nmore robust and accessible generative AI solutions.\n","authors":["Laura Manduchi","Kushagra Pandey","Clara Meister","Robert Bamler","Ryan Cotterell","Sina Däubener","Sophie 
Fellenz","Asja Fischer","Thomas Gärtner","Matthias Kirchler","Marius Kloft","Yingzhen Li","Christoph Lippert","Gerard de Melo","Eric Nalisnick","Björn Ommer","Rajesh Ranganath","Maja Rudolph","Karen Ullrich","Guy Van den Broeck","Julia E Vogt","Yixin Wang","Florian Wenzel","Frank Wood","Stephan Mandt","Vincent Fortuin"],"pdf_url":"https://arxiv.org/pdf/2403.00025v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07180v5","updated":"2025-03-06T15:26:56Z","published":"2024-11-11T17:57:30Z","title":"Gumbel Counterfactual Generation From Language Models","summary":" Understanding and manipulating the causal generation mechanisms in language\nmodels is essential for controlling their behavior. Previous work has primarily\nrelied on techniques such as representation surgery -- e.g., model ablations or\nmanipulation of linear subspaces tied to specific concepts -- to\n\\emph{intervene} on these models. To understand the impact of interventions\nprecisely, it is useful to examine \\emph{counterfactuals} -- e.g., how a given\nsentence would have appeared had it been generated by the model following a\nspecific intervention. We highlight that counterfactual reasoning is\nconceptually distinct from interventions, as articulated in Pearl's causal\nhierarchy. Based on this observation, we propose a framework for generating\ntrue string counterfactuals by reformulating language models as a structural\nequation model using the Gumbel-max trick, which we called Gumbel\ncounterfactual generation. This reformulation allows us to model the joint\ndistribution over original strings and their counterfactuals resulting from the\nsame instantiation of the sampling noise. We develop an algorithm based on\nhindsight Gumbel sampling that allows us to infer the latent noise variables\nand generate counterfactuals of observed strings. 
Our experiments demonstrate\nthat the approach produces meaningful counterfactuals while at the same time\nshowing that commonly used intervention techniques have considerable undesired\nside effects.\n","authors":["Shauli Ravfogel","Anej Svete","Vésteinn Snæbjarnarson","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07180v5.pdf","comment":"Accepted in ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04528v1","updated":"2025-03-06T15:16:57Z","published":"2025-03-06T15:16:57Z","title":"Federated Dynamic Modeling and Learning for Spatiotemporal Data\n Forecasting","summary":" This paper presents an advanced Federated Learning (FL) framework for\nforecasting complex spatiotemporal data, improving upon recent state-of-the-art\nmodels. In the proposed approach, the original Gated Recurrent Unit (GRU)\nmodule within previous Dynamic Spatial--Temporal Graph Convolutional Recurrent\nNetwork (DSTGCRN) modeling is first replaced with a Long Short-Term Memory\n(LSTM) network, enabling the resulting model to more effectively capture\nlong-term dependencies inherent to time series data. The resulting architecture\nsignificantly improves the model's capacity to handle complex temporal patterns\nin diverse forecasting applications. Furthermore, the proposed FL framework\nintegrates a novel Client-Side Validation (CSV) mechanism, introducing a\ncritical validation step at the client level before incorporating aggregated\nparameters from the central server into local models. This ensures that only\nthe most effective updates are adopted, improving both the robustness and\naccuracy of the forecasting model across clients. The efficiency of our\napproach is demonstrated through extensive experiments on real-world\napplications, including public datasets for multimodal transport demand\nforecasting and private datasets for Origin-Destination (OD) matrix forecasting\nin urban areas. 
The results demonstrate substantial improvements over\nconventional methods, highlighting the framework's ability to capture complex\nspatiotemporal dependencies while preserving data privacy. This work not only\nprovides a scalable and privacy-preserving solution for real-time,\nregion-specific forecasting and management but also underscores the potential\nof leveraging distributed data sources in a FL context. We provide our\nalgorithms as open-source on GitHub.\n","authors":["Thien Pham","Angelo Furno","Faïcel Chamroukhi","Latifa Oukhellou"],"pdf_url":"https://arxiv.org/pdf/2503.04528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07775v2","updated":"2025-03-06T15:15:58Z","published":"2024-12-10T18:59:58Z","title":"Efficient Diversity-Preserving Diffusion Alignment via Gradient-Informed\n GFlowNets","summary":" While one commonly trains large diffusion models by collecting datasets on\ntarget downstream tasks, it is often desired to align and finetune pretrained\ndiffusion models with some reward functions that are either designed by experts\nor learned from small-scale datasets. Existing post-training methods for reward\nfinetuning of diffusion models typically suffer from lack of diversity in\ngenerated samples, lack of prior preservation, and/or slow convergence in\nfinetuning. Inspired by recent successes in generative flow networks\n(GFlowNets), a class of probabilistic models that sample with the unnormalized\ndensity of a reward function, we propose a novel GFlowNet method dubbed\nNabla-GFlowNet (abbreviated as \\methodname), the first GFlowNet method that\nleverages the rich signal in reward gradients, together with an objective\ncalled \\graddb plus its variant \\resgraddb designed for prior-preserving\ndiffusion finetuning. 
We show that our proposed method achieves fast yet\ndiversity- and prior-preserving finetuning of Stable Diffusion, a large-scale\ntext-conditioned image diffusion model, on different realistic reward\nfunctions.\n","authors":["Zhen Liu","Tim Z. Xiao","Weiyang Liu","Yoshua Bengio","Dinghuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.07775v2.pdf","comment":"Technical Report (35 pages, 31 figures), Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2411.12580v2","updated":"2025-03-06T15:14:17Z","published":"2024-11-19T15:47:12Z","title":"Procedural Knowledge in Pretraining Drives Reasoning in Large Language\n Models","summary":" The capabilities and limitations of Large Language Models have been sketched\nout in great detail in recent years, providing an intriguing yet conflicting\npicture. On the one hand, LLMs demonstrate a general ability to solve problems.\nOn the other hand, they show surprising reasoning gaps when compared to humans,\ncasting doubt on the robustness of their generalisation strategies. The sheer\nvolume of data used in the design of LLMs has precluded us from applying the\nmethod traditionally used to measure generalisation: train-test set separation.\nTo overcome this, we study what kind of generalisation strategies LLMs employ\nwhen performing reasoning tasks by investigating the pretraining data they rely\non. For two models of different sizes (7B and 35B) and 2.5B of their\npretraining tokens, we identify what documents influence the model outputs for\nthree simple mathematical reasoning tasks and contrast this to the data that\nare influential for answering factual questions. We find that, while the models\nrely on mostly distinct sets of data for each factual question, a document\noften has a similar influence across different reasoning questions within the\nsame task, indicating the presence of procedural knowledge. We further find\nthat the answers to factual questions often show up in the most influential\ndata. 
However, for reasoning questions the answers usually do not show up as\nhighly influential, nor do the answers to the intermediate reasoning steps.\nWhen we characterise the top ranked documents for the reasoning questions\nqualitatively, we confirm that the influential documents often contain\nprocedural knowledge, like demonstrating how to obtain a solution using\nformulae or code. Our findings indicate that the approach to reasoning the\nmodels use is unlike retrieval, and more like a generalisable strategy that\nsynthesises procedural knowledge from documents doing a similar form of\nreasoning.\n","authors":["Laura Ruis","Maximilian Mozes","Juhan Bae","Siddhartha Rao Kamalakara","Dwarak Talupuru","Acyr Locatelli","Robert Kirk","Tim Rocktäschel","Edward Grefenstette","Max Bartolo"],"pdf_url":"https://arxiv.org/pdf/2411.12580v2.pdf","comment":"Published at ICLR 2025"},{"id":"http://arxiv.org/abs/2410.04166v2","updated":"2025-03-06T15:11:57Z","published":"2024-10-05T14:04:03Z","title":"Learning from negative feedback, or positive feedback or both","summary":" Existing preference optimization methods often assume scenarios where paired\npreference feedback (preferred/positive vs. dis-preferred/negative examples) is\navailable. This requirement limits their applicability in scenarios where only\nunpaired feedback--for example, either positive or negative--is available. To\naddress this, we introduce a novel approach that decouples learning from\npositive and negative feedback. This decoupling enables control over the\ninfluence of each feedback type and, importantly, allows learning even when\nonly one feedback type is present. A key contribution is demonstrating stable\nlearning from negative feedback alone, a capability not well-addressed by\ncurrent methods. 
Our approach builds upon the probabilistic framework\nintroduced in (Dayan and Hinton, 1997), which uses expectation-maximization\n(EM) to directly optimize the probability of positive outcomes (as opposed to\nclassic expected reward maximization). We address a key limitation in current\nEM-based methods: they solely maximize the likelihood of positive examples,\nwhile neglecting negative ones. We show how to extend EM algorithms to\nexplicitly incorporate negative examples, leading to a theoretically grounded\nalgorithm that offers an intuitive and versatile way to learn from both\npositive and negative feedback. We evaluate our approach for training language\nmodels based on human feedback as well as training policies for sequential\ndecision-making problems, where learned value functions are available.\n","authors":["Abbas Abdolmaleki","Bilal Piot","Bobak Shahriari","Jost Tobias Springenberg","Tim Hertweck","Rishabh Joshi","Junhyuk Oh","Michael Bloesch","Thomas Lampe","Nicolas Heess","Jonas Buchli","Martin Riedmiller"],"pdf_url":"https://arxiv.org/pdf/2410.04166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04518v1","updated":"2025-03-06T15:06:01Z","published":"2025-03-06T15:06:01Z","title":"Leveraging priors on distribution functions for multi-arm bandits","summary":" We introduce Dirichlet Process Posterior Sampling (DPPS), a Bayesian\nnon-parametric algorithm for multi-arm bandits based on Dirichlet Process (DP)\npriors. Like Thompson-sampling, DPPS is a probability-matching algorithm, i.e.,\nit plays an arm based on its posterior-probability of being optimal. Instead of\nassuming a parametric class for the reward generating distribution of each arm,\nand then putting a prior on the parameters, in DPPS the reward generating\ndistribution is directly modeled using DP priors. DPPS provides a principled\napproach to incorporate prior belief about the bandit environment, and in the\nnoninformative limit of the DP posteriors (i.e. 
Bayesian Bootstrap), we recover\nNon Parametric Thompson Sampling (NPTS), a popular non-parametric bandit\nalgorithm, as a special case of DPPS. We employ stick-breaking representation\nof the DP priors, and show excellent empirical performance of DPPS in\nchallenging synthetic and real world bandit environments. Finally, using an\ninformation-theoretic analysis, we show non-asymptotic optimality of DPPS in\nthe Bayesian regret setup.\n","authors":["Sumit Vashishtha","Odalric-Ambrym Maillard"],"pdf_url":"https://arxiv.org/pdf/2503.04518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.05874v2","updated":"2025-03-06T15:02:33Z","published":"2025-02-09T12:23:40Z","title":"MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor\n Scene Generation","summary":" Controllable 3D scene generation has extensive applications in virtual\nreality and interior design, where the generated scenes should exhibit high\nlevels of realism and controllability in terms of geometry. Scene graphs\nprovide a suitable data representation that facilitates these applications.\nHowever, current graph-based methods for scene generation are constrained to\ntext-based inputs and exhibit insufficient adaptability to flexible user\ninputs, hindering the ability to precisely control object geometry. To address\nthis issue, we propose MMGDreamer, a dual-branch diffusion model for scene\ngeneration that incorporates a novel Mixed-Modality Graph, visual enhancement\nmodule, and relation predictor. The mixed-modality graph allows object nodes to\nintegrate textual and visual modalities, with optional relationships between\nnodes. It enhances adaptability to flexible user inputs and enables meticulous\ncontrol over the geometry of objects in the generated scenes. The visual\nenhancement module enriches the visual fidelity of text-only nodes by\nconstructing visual representations using text embeddings. 
Furthermore, our\nrelation predictor leverages node representations to infer absent relationships\nbetween nodes, resulting in more coherent scene layouts. Extensive experimental\nresults demonstrate that MMGDreamer exhibits superior control of object\ngeometry, achieving state-of-the-art scene generation performance. Project\npage: https://yangzhifeio.github.io/project/MMGDreamer.\n","authors":["Zhifei Yang","Keyang Lu","Chao Zhang","Jiaxing Qi","Hanqi Jiang","Ruifei Ma","Shenglin Yin","Yifan Xu","Mingzhe Xing","Zhen Xiao","Jieyi Long","Xiangde Liu","Guangyao Zhai"],"pdf_url":"https://arxiv.org/pdf/2502.05874v2.pdf","comment":"Accepted by AAAI 2025 Main Track"},{"id":"http://arxiv.org/abs/2503.04509v1","updated":"2025-03-06T14:55:25Z","published":"2025-03-06T14:55:25Z","title":"STX-Search: Explanation Search for Continuous Dynamic Spatio-Temporal\n Models","summary":" Recent improvements in the expressive power of spatio-temporal models have\nled to performance gains in many real-world applications, such as traffic\nforecasting and social network modelling. However, understanding the\npredictions from a model is crucial to ensure reliability and trustworthiness,\nparticularly for high-risk applications, such as healthcare and transport. Few\nexisting methods are able to generate explanations for models trained on\ncontinuous-time dynamic graph data and, of these, the computational complexity\nand lack of suitable explanation objectives pose challenges. In this paper, we\npropose $\\textbf{S}$patio-$\\textbf{T}$emporal E$\\textbf{X}$planation\n$\\textbf{Search}$ (STX-Search), a novel method for generating instance-level\nexplanations that is applicable to static and dynamic temporal graph\nstructures. We introduce a novel search strategy and objective function, to\nfind explanations that are highly faithful and interpretable. 
When compared\nwith existing methods, STX-Search produces explanations of higher fidelity\nwhilst optimising explanation size to maintain interpretability.\n","authors":["Saif Anwar","Nathan Griffiths","Thomas Popham","Abhir Bhalerao"],"pdf_url":"https://arxiv.org/pdf/2503.04509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04507v1","updated":"2025-03-06T14:54:28Z","published":"2025-03-06T14:54:28Z","title":"A Morse Transform for Drug Discovery","summary":" We introduce a new ligand-based virtual screening (LBVS) framework that uses\npiecewise linear (PL) Morse theory to predict ligand binding potential. We\nmodel ligands as simplicial complexes via a pruned Delaunay triangulation, and\ncatalogue the critical points across multiple directional height functions.\nThis produces a rich feature vector, consisting of crucial topological features\n-- peaks, troughs, and saddles -- that characterise ligand surfaces relevant to\nbinding interactions. Unlike contemporary LBVS methods that rely on\ncomputationally-intensive deep neural networks, we require only a lightweight\nclassifier. The Morse theoretic approach achieves state-of-the-art performance\non standard datasets while offering an interpretable feature vector and\nscalable method for ligand prioritization in early-stage drug discovery.\n","authors":["Alexander M. Tanaka","Aras T. 
Asaad","Richard Cooper","Vidit Nanda"],"pdf_url":"https://arxiv.org/pdf/2503.04507v1.pdf","comment":"25 pages, 5 main figures, 2 main tables, 6 supplementary figures and\n 4 supplementary tables"},{"id":"http://arxiv.org/abs/2501.00020v2","updated":"2025-03-06T14:52:11Z","published":"2024-12-16T11:35:40Z","title":"Magnetic Field Data Calibration with Transformer Model Using Physical\n Constraints: A Scalable Method for Satellite Missions, Illustrated by\n Tianwen-1","summary":" This study introduces a novel approach that integrates the magnetic field\ndata correction from the Tianwen-1 Mars mission with a neural network\narchitecture constrained by physical principles derived from Maxwell's equation\nequations. By employing a Transformer based model capable of efficiently\nhandling sequential data, the method corrects measurement anomalies caused by\nsatellite dynamics, instrument interference, and environmental noise. As a\nresult, it significantly improves both the accuracy and the physical\nconsistency of the calibrated data. Compared to traditional methods that\nrequire long data segments and manual intervention often taking weeks or even\nmonths to complete this new approach can finish calibration in just minutes to\nhours, and predictions are made within seconds. 
This innovation not only\naccelerates the process of space weather modeling and planetary magnetospheric\nstudies but also provides a robust framework for future planetary exploration\nand solar wind interaction research.\n","authors":["Beibei Li","Yutian Chi","Yuming Wang"],"pdf_url":"https://arxiv.org/pdf/2501.00020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04496v1","updated":"2025-03-06T14:44:25Z","published":"2025-03-06T14:44:25Z","title":"Learning Object Placement Programs for Indoor Scene Synthesis with\n Iterative Self Training","summary":" Data driven and autoregressive indoor scene synthesis systems generate indoor\nscenes automatically by suggesting and then placing objects one at a time.\nEmpirical observations show that current systems tend to produce incomplete\nnext object location distributions. We introduce a system which addresses this\nproblem. We design a Domain Specific Language (DSL) that specifies functional\nconstraints. Programs from our language take as input a partial scene and\nobject to place. Upon execution they predict possible object placements. We\ndesign a generative model which writes these programs automatically. Available\n3D scene datasets do not contain programs to train on, so we build upon\nprevious work in unsupervised program induction to introduce a new program\nbootstrapping algorithm. In order to quantify our empirical observations we\nintroduce a new evaluation procedure which captures how well a system models\nper-object location distributions. We ask human annotators to label all the\npossible places an object can go in a scene and show that our system produces\nper-object location distributions more consistent with human annotators. 
Our\nsystem also generates indoor scenes of comparable quality to previous systems\nand while previous systems degrade in performance when training data is sparse,\nour system does not degrade to the same degree.\n","authors":["Adrian Chang","Kai Wang","Yuanbo Li","Manolis Savva","Angel X. Chang","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2503.04496v1.pdf","comment":"21 pages, 20 figures Subjects: Graphics (cs.GR), Computer Vision and\n Pattern Recognition (cs.CV), Machine Learning (cs.LG)"},{"id":"http://arxiv.org/abs/2503.04492v1","updated":"2025-03-06T14:40:21Z","published":"2025-03-06T14:40:21Z","title":"Accurate predictive model of band gap with selected important features\n based on explainable machine learning","summary":" In the rapidly advancing field of materials informatics, nonlinear machine\nlearning models have demonstrated exceptional predictive capabilities for\nmaterial properties. However, their black-box nature limits interpretability,\nand they may incorporate features that do not contribute to, or even\ndeteriorate, model performance. This study employs explainable ML (XML)\ntechniques, including permutation feature importance and the SHapley Additive\nexPlanation, applied to a pristine support vector regression model designed to\npredict band gaps at the GW level using 18 input features. Guided by\nXML-derived individual feature importance, a simple framework is proposed to\nconstruct reduced-feature predictive models. Model evaluations indicate that an\nXML-guided compact model, consisting of the top five features, achieves\ncomparable accuracy to the pristine model on in-domain datasets while\ndemonstrating superior generalization with lower prediction errors on\nout-of-domain data. Additionally, the study underscores the necessity for\neliminating strongly correlated features to prevent misinterpretation and\noverestimation of feature importance before applying XML. 
This study highlights\nXML's effectiveness in developing simplified yet highly accurate machine\nlearning models by clarifying feature roles.\n","authors":["Joohwi Lee","Kaito Miyamoto"],"pdf_url":"https://arxiv.org/pdf/2503.04492v1.pdf","comment":"9 pages, 4 figures, SI is included"},{"id":"http://arxiv.org/abs/2503.04483v1","updated":"2025-03-06T14:32:00Z","published":"2025-03-06T14:32:00Z","title":"InfoSEM: A Deep Generative Model with Informative Priors for Gene\n Regulatory Network Inference","summary":" Inferring Gene Regulatory Networks (GRNs) from gene expression data is\ncrucial for understanding biological processes. While supervised models are\nreported to achieve high performance for this task, they rely on costly ground\ntruth (GT) labels and risk learning gene-specific biases, such as class\nimbalances of GT interactions, rather than true regulatory mechanisms. To\naddress these issues, we introduce InfoSEM, an unsupervised generative model\nthat leverages textual gene embeddings as informative priors, improving GRN\ninference without GT labels. InfoSEM can also integrate GT labels as an\nadditional prior when available, avoiding biases and further enhancing\nperformance. Additionally, we propose a biologically motivated benchmarking\nframework that better reflects real-world applications such as biomarker\ndiscovery and reveals learned biases of existing supervised methods. 
InfoSEM\noutperforms existing models by 38.5% across four datasets using textual\nembeddings prior and further boosts performance by 11.1% when integrating\nlabeled data as priors.\n","authors":["Tianyu Cui","Song-Jun Xu","Artem Moskalev","Shuwei Li","Tommaso Mansi","Mangal Prakash","Rui Liao"],"pdf_url":"https://arxiv.org/pdf/2503.04483v1.pdf","comment":"ICLR 2025 AI4NA Oral, ICLR 2025 MLGenX Spotlight, ICLR 2025 LMRL"},{"id":"http://arxiv.org/abs/2503.04482v1","updated":"2025-03-06T14:30:55Z","published":"2025-03-06T14:30:55Z","title":"Generalized Interpolating Discrete Diffusion","summary":" While state-of-the-art language models achieve impressive results through\nnext-token prediction, they have inherent limitations such as the inability to\nrevise already generated tokens. This has prompted exploration of alternative\napproaches such as discrete diffusion. However, masked diffusion, which has\nemerged as a popular choice due to its simplicity and effectiveness,\nreintroduces this inability to revise words. To overcome this, we generalize\nmasked diffusion and derive the theoretical backbone of a family of general\ninterpolating discrete diffusion (GIDD) processes offering greater flexibility\nin the design of the noising processes. Leveraging a novel diffusion ELBO, we\nachieve compute-matched state-of-the-art performance in diffusion language\nmodeling. Exploiting GIDD's flexibility, we explore a hybrid approach combining\nmasking and uniform noise, leading to improved sample quality and unlocking the\nability for the model to correct its own mistakes, an area where autoregressive\nmodels notoriously have struggled. 
Our code and models are open-source:\nhttps://github.com/dvruette/gidd/\n","authors":["Dimitri von Rütte","Janis Fluri","Yuhui Ding","Antonio Orvieto","Bernhard Schölkopf","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2503.04482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04480v1","updated":"2025-03-06T14:30:15Z","published":"2025-03-06T14:30:15Z","title":"Poisoning Bayesian Inference via Data Deletion and Replication","summary":" Research in adversarial machine learning (AML) has shown that statistical\nmodels are vulnerable to maliciously altered data. However, despite advances in\nBayesian machine learning models, most AML research remains concentrated on\nclassical techniques. Therefore, we focus on extending the white-box model\npoisoning paradigm to attack generic Bayesian inference, highlighting its\nvulnerability in adversarial contexts. A suite of attacks are developed that\nallow an attacker to steer the Bayesian posterior toward a target distribution\nthrough the strategic deletion and replication of true observations, even when\nonly sampling access to the posterior is available. Analytic properties of\nthese algorithms are proven and their performance is empirically examined in\nboth synthetic and real-world scenarios. With relatively little effort, the\nattacker is able to substantively alter the Bayesian's beliefs and, by\naccepting more risk, they can mold these beliefs to their will. By carefully\nconstructing the adversarial posterior, surgical poisoning is achieved such\nthat only targeted inferences are corrupted and others are minimally disturbed.\n","authors":["Matthieu Carreau","Roi Naveiro","William N. 
Caballero"],"pdf_url":"https://arxiv.org/pdf/2503.04480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.13524v4","updated":"2025-03-06T14:27:12Z","published":"2025-02-19T08:21:59Z","title":"MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D\n Medical Image Analysis","summary":" Efficient evaluation of three-dimensional (3D) medical images is crucial for\ndiagnostic and therapeutic practices in healthcare. Recent years have seen a\nsubstantial uptake in applying deep learning and computer vision to analyse and\ninterpret medical images. Traditional approaches, such as convolutional neural\nnetworks (CNNs) and vision transformers (ViTs), face significant computational\nchallenges, prompting the need for architectural advancements. Recent efforts\nhave led to the introduction of novel architectures like the ``Mamba'' model as\nalternative solutions to traditional CNNs or ViTs. The Mamba model excels in\nthe linear processing of one-dimensional data with low computational demands.\nHowever, Mamba's potential for 3D medical image analysis remains underexplored\nand could face significant computational challenges as the dimension increases.\nThis manuscript presents MobileViM, a streamlined architecture for efficient\nsegmentation of 3D medical images. In the MobileViM network, we invent a new\ndimension-independent mechanism and a dual-direction traversing approach to\nincorporate with a vision-Mamba-based framework. MobileViM also features a\ncross-scale bridging technique to improve efficiency and accuracy across\nvarious medical imaging modalities. With these enhancements, MobileViM achieves\nsegmentation speeds exceeding 90 frames per second (FPS) on a single graphics\nprocessing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster\nthan the state-of-the-art deep learning models for processing 3D images with\nthe same computational resources. 
In addition, experimental evaluations\ndemonstrate that MobileViM delivers superior performance, with Dice similarity\nscores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024,\nATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses\nexisting models.\n","authors":["Wei Dai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2502.13524v4.pdf","comment":"The corresponding author disagrees with the manuscript submitted to\n arXiv"},{"id":"http://arxiv.org/abs/2503.03454v2","updated":"2025-03-06T14:25:03Z","published":"2025-03-05T12:40:34Z","title":"Data Poisoning Attacks to Locally Differentially Private Range Query\n Protocols","summary":" Local Differential Privacy (LDP) has been widely adopted to protect user\nprivacy in decentralized data collection. However, recent studies have revealed\nthat LDP protocols are vulnerable to data poisoning attacks, where malicious\nusers manipulate their reported data to distort aggregated results. In this\nwork, we present the first study on data poisoning attacks targeting LDP range\nquery protocols, focusing on both tree-based and grid-based approaches. We\nidentify three key challenges in executing such attacks, including crafting\nconsistent and effective fake data, maintaining data consistency across levels\nor grids, and preventing server detection. To address the first two challenges,\nwe propose novel attack methods that are provably optimal, including a\ntree-based attack and a grid-based attack, designed to manipulate range query\nresults with high effectiveness. \\textbf{Our key finding is that the common\npost-processing procedure, Norm-Sub, in LDP range query protocols can help the\nattacker massively amplify their attack effectiveness.} In addition, we study a\npotential countermeasure, but also propose an adaptive attack capable of\nevading this defense to address the third challenge. 
We evaluate our methods\nthrough theoretical analysis and extensive experiments on synthetic and\nreal-world datasets. Our results show that the proposed attacks can\nsignificantly amplify estimations for arbitrary range queries by manipulating a\nsmall fraction of users, providing 5-10x more influence than a normal user to\nthe estimation.\n","authors":["Ting-Wei Liao","Chih-Hsun Lin","Yu-Lin Tsai","Takao Murakami","Chia-Mu Yu","Jun Sakuma","Chun-Ying Huang","Hiroaki Kikuchi"],"pdf_url":"https://arxiv.org/pdf/2503.03454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04474v1","updated":"2025-03-06T14:24:12Z","published":"2025-03-06T14:24:12Z","title":"Know Thy Judge: On the Robustness Meta-Evaluation of LLM Safety Judges","summary":" Large Language Model (LLM) based judges form the underpinnings of key safety\nevaluation processes such as offline benchmarking, automated red-teaming, and\nonline guardrailing. This widespread requirement raises the crucial question:\ncan we trust the evaluations of these evaluators? In this paper, we highlight\ntwo critical challenges that are typically overlooked: (i) evaluations in the\nwild where factors like prompt sensitivity and distribution shifts can affect\nperformance and (ii) adversarial attacks that target the judge. We highlight\nthe importance of these through a study of commonly used safety judges, showing\nthat small changes such as the style of the model output can lead to jumps of\nup to 0.24 in the false negative rate on the same dataset, whereas adversarial\nattacks on the model generation can fool some judges into misclassifying 100%\nof harmful generations as safe ones. 
These findings reveal gaps in commonly\nused meta-evaluation benchmarks and weaknesses in the robustness of current LLM\njudges, indicating that low attack success under certain judges could create a\nfalse sense of security.\n","authors":["Francisco Eiras","Eliott Zemour","Eric Lin","Vaikkunth Mugunthan"],"pdf_url":"https://arxiv.org/pdf/2503.04474v1.pdf","comment":"Accepted to the ICBINB Workshop at ICLR'25"},{"id":"http://arxiv.org/abs/2503.04472v1","updated":"2025-03-06T14:23:06Z","published":"2025-03-06T14:23:06Z","title":"DAST: Difficulty-Adaptive Slow-Thinking for Large Reasoning Models","summary":" Recent advancements in slow-thinking reasoning models have shown exceptional\nperformance in complex reasoning tasks. However, these models often exhibit\noverthinking-generating redundant reasoning steps for simple problems, leading\nto excessive computational resource usage. While current mitigation strategies\nuniformly reduce reasoning tokens, they risk degrading performance on\nchallenging tasks that require extended reasoning. This paper introduces\nDifficulty-Adaptive Slow-Thinking (DAST), a novel framework that enables models\nto autonomously adjust the length of Chain-of-Thought(CoT) based on problem\ndifficulty. We first propose a Token Length Budget (TLB) metric to quantify\ndifficulty, then leveraging length-aware reward shaping and length preference\noptimization to implement DAST. 
DAST penalizes overlong responses for simple\ntasks while incentivizing sufficient reasoning for complex problems.\nExperiments on diverse datasets and model scales demonstrate that DAST\neffectively mitigates overthinking (reducing token usage by over 30\\% on\naverage) while preserving reasoning accuracy on complex problems.\n","authors":["Yi Shen","Jian Zhang","Jieyun Huang","Shuming Shi","Wenjing Zhang","Jiangze Yan","Ning Wang","Kai Wang","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2503.04472v1.pdf","comment":"working in progress"},{"id":"http://arxiv.org/abs/2503.04469v1","updated":"2025-03-06T14:19:55Z","published":"2025-03-06T14:19:55Z","title":"An artificially intelligent magnetic resonance spectroscopy\n quantification method: Comparison between QNet and LCModel on the cloud\n computing platform CloudBrain-MRS","summary":" Objctives: This work aimed to statistically compare the metabolite\nquantification of human brain magnetic resonance spectroscopy (MRS) between the\ndeep learning method QNet and the classical method LCModel through an\neasy-to-use intelligent cloud computing platform CloudBrain-MRS. Materials and\nMethods: In this retrospective study, two 3 T MRI scanners Philips Ingenia and\nAchieva collected 61 and 46 in vivo 1H magnetic resonance (MR) spectra of\nhealthy participants, respectively, from the brain region of pregenual anterior\ncingulate cortex from September to October 2021. The analyses of Bland-Altman,\nPearson correlation and reasonability were performed to assess the degree of\nagreement, linear correlation and reasonability between the two quantification\nmethods. Results: Fifteen healthy volunteers (12 females and 3 males, age\nrange: 21-35 years, mean age/standard deviation = 27.4/3.9 years) were\nrecruited. 
The analyses of Bland-Altman, Pearson correlation and reasonability\nshowed high to good consistency and very strong to moderate correlation between\nthe two methods for quantification of total N-acetylaspartate (tNAA), total\ncholine (tCho), and inositol (Ins) (relative half interval of limits of\nagreement = 3.04%, 9.3%, and 18.5%, respectively; Pearson correlation\ncoefficient r = 0.775, 0.927, and 0.469, respectively). In addition,\nquantification results of QNet are more likely to be closer to the previous\nreported average values than those of LCModel. Conclusion: There were high or\ngood degrees of consistency between the quantification results of QNet and\nLCModel for tNAA, tCho, and Ins, and QNet generally has more reasonable\nquantification than LCModel.\n","authors":["Meijin Lin","Lin Guo","Dicheng Chen","Jianshu Chen","Zhangren Tu","Xu Huang","Jianhua Wang","Ji Qi","Yuan Long","Zhiguo Huang","Di Guo","Xiaobo Qu","Haiwei Han"],"pdf_url":"https://arxiv.org/pdf/2503.04469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04462v1","updated":"2025-03-06T14:13:59Z","published":"2025-03-06T14:13:59Z","title":"PALo: Learning Posture-Aware Locomotion for Quadruped Robots","summary":" With the rapid development of embodied intelligence, locomotion control of\nquadruped robots on complex terrains has become a research hotspot. Unlike\ntraditional locomotion control approaches focusing solely on velocity tracking,\nwe pursue to balance the agility and robustness of quadruped robots on diverse\nand complex terrains. To this end, we propose an end-to-end deep reinforcement\nlearning framework for posture-aware locomotion named PALo, which manages to\nhandle simultaneous linear and angular velocity tracking and real-time\nadjustments of body height, pitch, and roll angles. 
In PALo, the locomotion\ncontrol problem is formulated as a partially observable Markov decision\nprocess, and an asymmetric actor-critic architecture is adopted to overcome the\nsim-to-real challenge. Further, by incorporating customized training curricula,\nPALo achieves agile posture-aware locomotion control in simulated environments\nand successfully transfers to real-world settings without fine-tuning, allowing\nreal-time control of the quadruped robot's locomotion and body posture across\nchallenging terrains. Through in-depth experimental analysis, we identify the\nkey components of PALo that contribute to its performance, further validating\nthe effectiveness of the proposed method. The results of this study provide new\npossibilities for the low-level locomotion control of quadruped robots in\nhigher dimensional command spaces and lay the foundation for future research on\nupper-level modules for embodied intelligence.\n","authors":["Xiangyu Miao","Jun Sun","Hang Lai","Xinpeng Di","Jiahang Cao","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.01684v2","updated":"2025-03-06T14:07:34Z","published":"2025-03-03T15:58:15Z","title":"An Efficient Learning Method to Connect Observables","summary":" Constructing fast and accurate surrogate models is a key ingredient for\nmaking robust predictions in many topics. We introduce a new model, the\nMultiparameter Eigenvalue Problem (MEP) emulator. The new method connects\nemulators and can make predictions directly from observables to observables. We\npresent that the MEP emulator can be trained with data from Eigenvector\nContinuation (EC) and Parametric Matrix Model (PMM) emulators. A simple\nsimulation on a one-dimensional lattice confirms the performance of the MEP\nemulator. 
Using $^{28}$O as an example, we also demonstrate that the predictive\nprobability distribution of the target observables can be easily obtained\nthrough the new emulator.\n","authors":["Hang Yu","Takayuki Miyagi"],"pdf_url":"https://arxiv.org/pdf/2503.01684v2.pdf","comment":"5+2 pages, 4 figures, updated acknowledgment"},{"id":"http://arxiv.org/abs/2503.04453v1","updated":"2025-03-06T14:06:50Z","published":"2025-03-06T14:06:50Z","title":"Reproducibility Assessment of Magnetic Resonance Spectroscopy of\n Pregenual Anterior Cingulate Cortex across Sessions and Vendors via the Cloud\n Computing Platform CloudBrain-MRS","summary":" Given the need to elucidate the mechanisms underlying illnesses and their\ntreatment, as well as the lack of harmonization of acquisition and\npost-processing protocols among different magnetic resonance system vendors,\nthis work is to determine if metabolite concentrations obtained from different\nsessions, machine models and even different vendors of 3 T scanners can be\nhighly reproducible and be pooled for diagnostic analysis, which is very\nvaluable for the research of rare diseases. Participants underwent magnetic\nresonance imaging (MRI) scanning once on two separate days within one week (one\nsession per day, each session including two proton magnetic resonance\nspectroscopy (1H-MRS) scans with no more than a 5-minute interval between scans\n(no off-bed activity)) on each machine. were analyzed for reliability of\nwithin- and between- sessions using the coefficient of variation (CV) and\nintraclass correlation coefficient (ICC), and for reproducibility of across the\nmachines using correlation coefficient. As for within- and between- session,\nall CV values for a group of all the first or second scans of a session, or for\na session were almost below 20%, and most of the ICCs for metabolites range\nfrom moderate (0.4-0.59) to excellent (0.75-1), indicating high data\nreliability. 
When it comes to the reproducibility across the three scanners,\nall Pearson correlation coefficients across the three machines approached 1\nwith most around 0.9, and majority demonstrated statistical significance\n(P<0.01). Additionally, the intra-vendor reproducibility was greater than the\ninter-vendor ones.\n","authors":["Runhan Chen","Meijin Lin","Jianshu Chen","Liangjie Lin","Jiazheng Wang","Xiaoqing Li","Jianhua Wang","Xu Huang","Ling Qian","Shaoxing Liu","Yuan Long","Di Guo","Xiaobo Qu","Haiwei Han"],"pdf_url":"https://arxiv.org/pdf/2503.04453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04451v1","updated":"2025-03-06T14:06:20Z","published":"2025-03-06T14:06:20Z","title":"Privacy Preserving and Robust Aggregation for Cross-Silo Federated\n Learning in Non-IID Settings","summary":" Federated Averaging remains the most widely used aggregation strategy in\nfederated learning due to its simplicity and scalability. However, its\nperformance degrades significantly in non-IID data settings, where client\ndistributions are highly imbalanced or skewed. Additionally, it relies on\nclients transmitting metadata, specifically the number of training samples,\nwhich introduces privacy risks and may conflict with regulatory frameworks like\nthe European GDPR. In this paper, we propose a novel aggregation strategy that\naddresses these challenges by introducing class-aware gradient masking. Unlike\ntraditional approaches, our method relies solely on gradient updates,\neliminating the need for any additional client metadata, thereby enhancing\nprivacy protection. 
Furthermore, our approach validates and dynamically weights\nclient contributions based on class-specific importance, ensuring robustness\nagainst non-IID distributions, convergence prevention, and backdoor attacks.\nExtensive experiments on benchmark datasets demonstrate that our method not\nonly outperforms FedAvg and other widely accepted aggregation strategies in\nnon-IID settings but also preserves model integrity in adversarial scenarios.\nOur results establish the effectiveness of gradient masking as a practical and\nsecure solution for federated learning.\n","authors":["Marco Arazzi","Mert Cihangiroglu","Antonino Nocera"],"pdf_url":"https://arxiv.org/pdf/2503.04451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04447v1","updated":"2025-03-06T14:02:28Z","published":"2025-03-06T14:02:28Z","title":"A Graph-Partitioning Based Continuous Optimization Approach to\n Semi-supervised Clustering Problems","summary":" Semi-supervised clustering is a basic problem in various applications. Most\nexisting methods require knowledge of the ideal cluster number, which is often\ndifficult to obtain in practice. Besides, satisfying the must-link constraints\nis another major challenge for these methods. In this work, we view the\nsemi-supervised clustering task as a partitioning problem on a graph associated\nwith the given dataset, where the similarity matrix includes a scaling\nparameter to reflect the must-link constraints. Utilizing a relaxation\ntechnique, we formulate the graph partitioning problem into a continuous\noptimization model that does not require the exact cluster number, but only an\noverestimate of it. We then propose a block coordinate descent algorithm to\nefficiently solve this model, and establish its convergence result. Based on\nthe obtained solution, we can construct the clusters that theoretically meet\nthe must-link constraints under mild assumptions. 
Furthermore, we verify the\neffectiveness and efficiency of our proposed method through comprehensive\nnumerical experiments.\n","authors":["Wei Liu","Xin Liu","Michael K. Ng","Zaikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08010v2","updated":"2025-03-06T14:01:48Z","published":"2024-02-12T19:18:50Z","title":"Which Frequencies do CNNs Need? Emergent Bottleneck Structure in Feature\n Learning","summary":" We describe the emergence of a Convolution Bottleneck (CBN) structure in\nCNNs, where the network uses its first few layers to transform the input\nrepresentation into a representation that is supported only along a few\nfrequencies and channels, before using the last few layers to map back to the\noutputs. We define the CBN rank, which describes the number and type of\nfrequencies that are kept inside the bottleneck, and partially prove that the\nparameter norm required to represent a function $f$ scales as depth times the\nCBN rank $f$. We also show that the parameter norm depends at next order on the\nregularity of $f$. We show that any network with almost optimal parameter norm\nwill exhibit a CBN structure in both the weights and - under the assumption\nthat the network is stable under large learning rate - the activations, which\nmotivates the common practice of down-sampling; and we verify that the CBN\nresults still hold with down-sampling. 
Finally we use the CBN structure to\ninterpret the functions learned by CNNs on a number of tasks.\n","authors":["Yuxiao Wen","Arthur Jacot"],"pdf_url":"https://arxiv.org/pdf/2402.08010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17573v2","updated":"2025-03-06T13:47:53Z","published":"2024-05-27T18:15:05Z","title":"Hamiltonian Mechanics of Feature Learning: Bottleneck Structure in Leaky\n ResNets","summary":" We study Leaky ResNets, which interpolate between ResNets and Fully-Connected\nnets depending on an 'effective depth' hyper-parameter $\\tilde{L}$. In the\ninfinite depth limit, we study 'representation geodesics' $A_{p}$: continuous\npaths in representation space (similar to NeuralODEs) from input $p=0$ to\noutput $p=1$ that minimize the parameter norm of the network. We give a\nLagrangian and Hamiltonian reformulation, which highlight the importance of two\nterms: a kinetic energy which favors small layer derivatives\n$\\partial_{p}A_{p}$ and a potential energy that favors low-dimensional\nrepresentations, as measured by the 'Cost of Identity'. The balance between\nthese two forces offers an intuitive understanding of feature learning in\nResNets. We leverage this intuition to explain the emergence of a bottleneck\nstructure, as observed in previous work: for large $\\tilde{L}$ the potential\nenergy dominates and leads to a separation of timescales, where the\nrepresentation jumps rapidly from the high dimensional inputs to a\nlow-dimensional representation, move slowly inside the space of low-dimensional\nrepresentations, before jumping back to the potentially high-dimensional\noutputs. 
Inspired by this phenomenon, we train with an adaptive layer step-size\nto adapt to the separation of timescales.\n","authors":["Arthur Jacot","Alexandre Kaiser"],"pdf_url":"https://arxiv.org/pdf/2405.17573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05664v2","updated":"2025-03-06T13:40:09Z","published":"2024-07-08T06:59:29Z","title":"How DNNs break the Curse of Dimensionality: Compositionality and\n Symmetry Learning","summary":" We show that deep neural networks (DNNs) can efficiently learn any\ncomposition of functions with bounded $F_{1}$-norm, which allows DNNs to break\nthe curse of dimensionality in ways that shallow networks cannot. More\nspecifically, we derive a generalization bound that combines a covering number\nargument for compositionality, and the $F_{1}$-norm (or the related Barron\nnorm) for large width adaptivity. We show that the global minimizer of the\nregularized loss of DNNs can fit for example the composition of two functions\n$f^{*}=h\\circ g$ from a small number of observations, assuming $g$ is\nsmooth/regular and reduces the dimensionality (e.g. $g$ could be the quotient\nmap of the symmetries of $f^{*}$), so that $h$ can be learned in spite of its\nlow regularity. The measures of regularity we consider is the Sobolev norm with\ndifferent levels of differentiability, which is well adapted to the $F_{1}$\nnorm. We compute scaling laws empirically and observe phase transitions\ndepending on whether $g$ or $h$ is harder to learn, as predicted by our theory.\n","authors":["Arthur Jacot","Seok Hoan Choi","Yuxiao Wen"],"pdf_url":"https://arxiv.org/pdf/2407.05664v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12261v3","updated":"2025-03-06T13:39:32Z","published":"2024-10-16T05:58:55Z","title":"CATCH: Channel-Aware multivariate Time Series Anomaly Detection via\n Frequency Patching","summary":" Anomaly detection in multivariate time series is challenging as heterogeneous\nsubsequence anomalies may occur. 
Reconstruction-based methods, which focus on\nlearning normal patterns in the frequency domain to detect diverse abnormal\nsubsequences, achieve promising results, while still falling short on capturing\nfine-grained frequency characteristics and channel correlations. To contend\nwith the limitations, we introduce CATCH, a framework based on frequency\npatching. We propose to patchify the frequency domain into frequency bands,\nwhich enhances its ability to capture fine-grained frequency characteristics.\nTo perceive appropriate channel correlations, we propose a Channel Fusion\nModule (CFM), which features a patch-wise mask generator and a masked-attention\nmechanism. Driven by a bi-level multi-objective optimization algorithm, the CFM\nis encouraged to iteratively discover appropriate patch-wise channel\ncorrelations, and to cluster relevant channels while isolating adverse effects\nfrom irrelevant channels. Extensive experiments on 10 real-world datasets and\n12 synthetic datasets demonstrate that CATCH achieves state-of-the-art\nperformance. We make our code and datasets available at\nhttps://github.com/decisionintelligence/CATCH.\n","authors":["Xingjian Wu","Xiangfei Qiu","Zhengyu Li","Yihang Wang","Jilin Hu","Chenjuan Guo","Hui Xiong","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2410.12261v3.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2503.04426v1","updated":"2025-03-06T13:35:59Z","published":"2025-03-06T13:35:59Z","title":"FORTALESA: Fault-Tolerant Reconfigurable Systolic Array for DNN\n Inference","summary":" The emergence of Deep Neural Networks (DNNs) in mission- and safety-critical\napplications brings their reliability to the front. High performance demands of\nDNNs require the use of specialized hardware accelerators. Systolic array\narchitecture is widely used in DNN accelerators due to its parallelism and\nregular structure. 
This work presents a run-time reconfigurable systolic array\narchitecture with three execution modes and four implementation options. All\nfour implementations are evaluated in terms of resource utilization,\nthroughput, and fault tolerance improvement. The proposed architecture is used\nfor reliability enhancement of DNN inference on systolic array through\nheterogeneous mapping of different network layers to different execution modes.\nThe approach is supported by a novel reliability assessment method based on\nfault propagation analysis. It is used for the exploration of the appropriate\nexecution mode-layer mapping for DNN inference. The proposed architecture\nefficiently protects registers and MAC units of systolic array PEs from\ntransient and permanent faults. The reconfigurability feature enables a speedup\nof up to $3\\times$, depending on layer vulnerability. Furthermore, it requires\n$6\\times$ less resources compared to static redundancy and $2.5\\times$ less\nresources compared to the previously proposed solution for transient faults.\n","authors":["Natalia Cherezova","Artur Jutman","Maksim Jenihhin"],"pdf_url":"https://arxiv.org/pdf/2503.04426v1.pdf","comment":"11 pages, 15 figures"},{"id":"http://arxiv.org/abs/2503.04424v1","updated":"2025-03-06T13:32:13Z","published":"2025-03-06T13:32:13Z","title":"Determinant Estimation under Memory Constraints and Neural Scaling Laws","summary":" Calculating or accurately estimating log-determinants of large positive\nsemi-definite matrices is of fundamental importance in many machine learning\ntasks. While its cubic computational complexity can already be prohibitive, in\nmodern applications, even storing the matrices themselves can pose a memory\nbottleneck. To address this, we derive a novel hierarchical algorithm based on\nblock-wise computation of the LDL decomposition for large-scale log-determinant\ncalculation in memory-constrained settings. 
In extreme cases where matrices are\nhighly ill-conditioned, accurately computing the full matrix itself may be\ninfeasible. This is particularly relevant when considering kernel matrices at\nscale, including the empirical Neural Tangent Kernel (NTK) of neural networks\ntrained on large datasets. Under the assumption of neural scaling laws in the\ntest error, we show that the ratio of pseudo-determinants satisfies a power-law\nrelationship, allowing us to derive corresponding scaling laws. This enables\naccurate estimation of NTK log-determinants from a tiny fraction of the full\ndataset; in our experiments, this results in a $\\sim$100,000$\\times$ speedup\nwith improved accuracy over competing approximations. Using these techniques,\nwe successfully estimate log-determinants for dense matrices of extreme sizes,\nwhich were previously deemed intractable and inaccessible due to their enormous\nscale and computational demands.\n","authors":["Siavash Ameli","Chris van der Heide","Liam Hodgkinson","Fred Roosta","Michael W. Mahoney"],"pdf_url":"https://arxiv.org/pdf/2503.04424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07978v4","updated":"2025-03-06T13:29:24Z","published":"2023-11-14T08:10:14Z","title":"AfroBench: How Good are Large Language Models on African Languages?","summary":" Large-scale multilingual evaluations, such as MEGA, often include only a\nhandful of African languages due to the scarcity of high-quality evaluation\ndata and the limited discoverability of existing African datasets. This lack of\nrepresentation hinders comprehensive LLM evaluation across a diverse range of\nlanguages and tasks. To address these challenges, we introduce AfroBench -- a\nmulti-task benchmark for evaluating the performance of LLMs across 64 African\nlanguages, 15 tasks and 22 datasets. AfroBench consists of nine natural\nlanguage understanding datasets, six text generation datasets, six knowledge\nand question answering tasks, and one mathematical reasoning task. 
We present\nresults comparing the performance of prompting LLMs to fine-tuned baselines\nbased on BERT and T5-style models. Our results suggest large gaps in\nperformance between high-resource languages, such as English, and African\nlanguages across most tasks; but performance also varies based on the\navailability of monolingual data resources. Our findings confirm that\nperformance on African languages continues to remain a hurdle for current LLMs,\nunderscoring the need for additional efforts to close this gap.\n https://mcgill-nlp.github.io/AfroBench/\n","authors":["Jessica Ojo","Odunayo Ogundepo","Akintunde Oladipo","Kelechi Ogueji","Jimmy Lin","Pontus Stenetorp","David Ifeoluwa Adelani"],"pdf_url":"https://arxiv.org/pdf/2311.07978v4.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2503.04418v1","updated":"2025-03-06T13:21:38Z","published":"2025-03-06T13:21:38Z","title":"AOLO: Analysis and Optimization For Low-Carbon Oriented Wireless Large\n Language Model Services","summary":" Recent advancements in large language models (LLMs) have led to their\nwidespread adoption and large-scale deployment across various domains. However,\ntheir environmental impact, particularly during inference, has become a growing\nconcern due to their substantial energy consumption and carbon footprint.\nExisting research has focused on inference computation alone, overlooking the\nanalysis and optimization of carbon footprint in network-aided LLM service\nsystems. To address this gap, we propose AOLO, a framework for analysis and\noptimization for low-carbon oriented wireless LLM services. AOLO introduces a\ncomprehensive carbon footprint model that quantifies greenhouse gas emissions\nacross the entire LLM service chain, including computational inference and\nwireless communication. 
Furthermore, we formulate an optimization problem aimed\nat minimizing the overall carbon footprint, which is solved through joint\noptimization of inference outputs and transmit power under\nquality-of-experience and system performance constraints. To achieve this joint\noptimization, we leverage the energy efficiency of spiking neural networks\n(SNNs) by adopting SNN as the actor network and propose a low-carbon-oriented\noptimization algorithm, i.e., SNN-based deep reinforcement learning (SDRL).\nComprehensive simulations demonstrate that SDRL algorithm significantly reduces\noverall carbon footprint, achieving an 18.77% reduction compared to the\nbenchmark soft actor-critic, highlighting its potential for enabling more\nsustainable LLM inference services.\n","authors":["Xiaoqi Wang","Hongyang Du","Yuehong Gao","Dong In Kim"],"pdf_url":"https://arxiv.org/pdf/2503.04418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04416v1","updated":"2025-03-06T13:18:37Z","published":"2025-03-06T13:18:37Z","title":"Learning Transformer-based World Models with Contrastive Predictive\n Coding","summary":" The DreamerV3 algorithm recently obtained remarkable performance across\ndiverse environment domains by learning an accurate world model based on\nRecurrent Neural Networks (RNNs). Following the success of model-based\nreinforcement learning algorithms and the rapid adoption of the Transformer\narchitecture for its superior training efficiency and favorable scaling\nproperties, recent works such as STORM have proposed replacing RNN-based world\nmodels with Transformer-based world models using masked self-attention.\nHowever, despite the improved training efficiency of these methods, their\nimpact on performance remains limited compared to the Dreamer algorithm,\nstruggling to learn competitive Transformer-based world models. 
In this work,\nwe show that the next state prediction objective adopted in previous approaches\nis insufficient to fully exploit the representation capabilities of\nTransformers. We propose to extend world model predictions to longer time\nhorizons by introducing TWISTER (Transformer-based World model wIth contraSTivE\nRepresentations), a world model using action-conditioned Contrastive Predictive\nCoding to learn high-level temporal feature representations and improve the\nagent performance. TWISTER achieves a human-normalized mean score of 162% on\nthe Atari 100k benchmark, setting a new record among state-of-the-art methods\nthat do not employ look-ahead search.\n","authors":["Maxime Burchi","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2503.04416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04406v1","updated":"2025-03-06T13:00:53Z","published":"2025-03-06T13:00:53Z","title":"Training-Free Graph Filtering via Multimodal Feature Refinement for\n Extremely Fast Multimodal Recommendation","summary":" Multimodal recommender systems improve the performance of canonical\nrecommender systems with no item features by utilizing diverse content types\nsuch as text, images, and videos, while alleviating inherent sparsity of\nuser-item interactions and accelerating user engagement. However, current\nneural network-based models often incur significant computational overhead due\nto the complex training process required to learn and integrate information\nfrom multiple modalities. 
To overcome this limitation, we propose\nMultiModal-Graph Filtering (MM-GF), a training-free method based on the notion\nof graph filtering (GF) for efficient and accurate multimodal recommendations.\nSpecifically, MM-GF first constructs multiple similarity graphs through\nnontrivial multimodal feature refinement such as robust scaling and vector\nshifting by addressing the heterogeneous characteristics across modalities.\nThen, MM-GF optimally fuses multimodal information using linear low-pass\nfilters across different modalities. Extensive experiments on real-world\nbenchmark datasets demonstrate that MM-GF not only improves recommendation\naccuracy by up to 13.35% compared to the best competitor but also dramatically\nreduces computational costs by achieving the runtime of less than 10 seconds.\n","authors":["Yu-Seung Roh","Joo-Young Kim","Jin-Duk Park","Won-Yong Shin"],"pdf_url":"https://arxiv.org/pdf/2503.04406v1.pdf","comment":"10 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2503.04404v1","updated":"2025-03-06T12:58:09Z","published":"2025-03-06T12:58:09Z","title":"Temporal Analysis of NetFlow Datasets for Network Intrusion Detection\n Systems","summary":" This paper investigates the temporal analysis of NetFlow datasets for machine\nlearning (ML)-based network intrusion detection systems (NIDS). Although many\nprevious studies have highlighted the critical role of temporal features, such\nas inter-packet arrival time and flow length/duration, in NIDS, the currently\navailable NetFlow datasets for NIDS lack these temporal features. This study\naddresses this gap by creating and making publicly available a set of NetFlow\ndatasets that incorporate these temporal features [1]. With these temporal\nfeatures, we provide a comprehensive temporal analysis of NetFlow datasets by\nexamining the distribution of various features over time and presenting\ntime-series representations of NetFlow features. 
This temporal analysis has not\nbeen previously provided in the existing literature. We also borrowed an idea\nfrom signal processing, time frequency analysis, and tested it to see how\ndifferent the time frequency signal presentations (TFSPs) are for various\nattacks. The results indicate that many attacks have unique patterns, which\ncould help ML models to identify them more easily.\n","authors":["Majed Luay","Siamak Layeghy","Seyedehfaezeh Hosseininoorbin","Mohanad Sarhan","Nour Moustafa","Marius Portmann"],"pdf_url":"https://arxiv.org/pdf/2503.04404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04398v1","updated":"2025-03-06T12:52:22Z","published":"2025-03-06T12:52:22Z","title":"Speculative MoE: Communication Efficient Parallel MoE Inference with\n Speculative Token and Expert Pre-scheduling","summary":" MoE (Mixture of Experts) prevails as a neural architecture that can scale\nmodern transformer-based LLMs (Large Language Models) to unprecedented scales.\nNevertheless, large MoEs' great demands of computing power, memory capacity and\nmemory bandwidth make scalable serving a fundamental challenge and efficient\nparallel inference has become a requisite to attain adequate throughput under\nlatency constraints. DeepSpeed-MoE, one state-of-the-art MoE inference\nframework, adopts a 3D-parallel paradigm including EP (Expert Parallelism), TP\n(Tensor Parallel) and DP (Data Parallelism). However, our analysis shows\nDeepSpeed-MoE's inference efficiency is largely bottlenecked by EP, which is\nimplemented with costly all-to-all collectives to route token activation. Our\nwork aims to boost DeepSpeed-MoE by strategically reducing EP's communication\noverhead with a technique named Speculative MoE. 
Speculative MoE has two\nspeculative parallelization schemes, speculative token shuffling and\nspeculative expert grouping, which predict outstanding tokens' expert routing\npaths and pre-schedule tokens and experts across devices to losslessly trim\nEP's communication volume. Besides DeepSpeed-MoE, we also build Speculative MoE\ninto a prevailing MoE inference engine SGLang. Experiments show Speculative MoE\ncan significantly boost state-of-the-art MoE inference frameworks on fast\nhomogeneous and slow heterogeneous interconnects.\n","authors":["Yan Li","Pengfei Zheng","Shuang Chen","Zewei Xu","Yunfei Du","Zhengang Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13483v3","updated":"2025-03-06T12:51:49Z","published":"2025-01-23T08:57:02Z","title":"Robust Amortized Bayesian Inference with Self-Consistency Losses on\n Unlabeled Data","summary":" Neural amortized Bayesian inference (ABI) can solve probabilistic inverse\nproblems orders of magnitude faster than classical methods. However, neural ABI\nis not yet sufficiently robust for widespread and safe applicability. In\nparticular, when performing inference on observations outside of the scope of\nthe simulated data seen during training, for example, because of model\nmisspecification, the posterior approximations are likely to become highly\nbiased. Due to the bad pre-asymptotic behavior of current neural posterior\nestimators in the out-of-simulation regime, the resulting estimation biases\ncannot be fixed in acceptable time by just simulating more training data. 
In\nthis proof-of-concept paper, we propose a semi-supervised approach that enables\ntraining not only on (labeled) simulated data generated from the model, but\nalso on unlabeled data originating from any source, including real-world data.\nTo achieve the latter, we exploit Bayesian self-consistency properties that can\nbe transformed into strictly proper losses without requiring knowledge of true\nparameter values, that is, without requiring data labels. The results of our\ninitial experiments show remarkable improvements in the robustness of ABI on\nout-of-simulation data. Even if the observed data is far away from both labeled\nand unlabeled training data, inference remains highly accurate. If our findings\nalso generalize to other scenarios and model classes, we believe that our new\nmethod represents a major breakthrough in neural ABI.\n","authors":["Aayush Mishra","Daniel Habermann","Marvin Schmitt","Stefan T. Radev","Paul-Christian Bürkner"],"pdf_url":"https://arxiv.org/pdf/2501.13483v3.pdf","comment":"added acknowledgements"},{"id":"http://arxiv.org/abs/2503.03285v2","updated":"2025-03-06T12:42:37Z","published":"2025-03-05T09:12:16Z","title":"Enhancing Vietnamese VQA through Curriculum Learning on Raw and\n Augmented Text Representations","summary":" Visual Question Answering (VQA) is a multimodal task requiring reasoning\nacross textual and visual inputs, which becomes particularly challenging in\nlow-resource languages like Vietnamese due to linguistic variability and the\nlack of high-quality datasets. Traditional methods often rely heavily on\nextensive annotated datasets, computationally expensive pipelines, and large\npre-trained models, specifically in the domain of Vietnamese VQA, limiting\ntheir applicability in such scenarios. To address these limitations, we propose\na training framework that combines a paraphrase-based feature augmentation\nmodule with a dynamic curriculum learning strategy. 
Explicitly, augmented\nsamples are considered \"easy\" while raw samples are regarded as \"hard\". The\nframework then utilizes a mechanism that dynamically adjusts the ratio of easy\nto hard samples during training, progressively modifying the same dataset to\nincrease its difficulty level. By enabling gradual adaptation to task\ncomplexity, this approach helps the Vietnamese VQA model generalize well, thus\nimproving overall performance. Experimental results show consistent\nimprovements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset,\nhighlighting both the potential and challenges of our approach in advancing VQA\nfor Vietnamese language.\n","authors":["Khoi Anh Nguyen","Linh Yen Vu","Thang Dinh Duong","Thuan Nguyen Duong","Huy Thanh Nguyen","Vinh Quang Dinh"],"pdf_url":"https://arxiv.org/pdf/2503.03285v2.pdf","comment":"10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and\n Intelligence"},{"id":"http://arxiv.org/abs/2407.07918v2","updated":"2025-03-06T12:41:21Z","published":"2024-07-07T12:41:40Z","title":"Detecting new obfuscated malware variants: A lightweight and\n interpretable machine learning approach","summary":" Machine learning has been successfully applied in developing malware\ndetection systems, with a primary focus on accuracy, and increasing attention\nto reducing computational overhead and improving model interpretability.\nHowever, an important question remains underexplored: How well can machine\nlearning-based models detect entirely new forms of malware not present in the\ntraining data? In this study, we present a machine learning-based system for\ndetecting obfuscated malware that is not only highly accurate, lightweight and\ninterpretable, but also capable of successfully adapting to new types of\nmalware attacks. Our system is capable of detecting 15 malware subtypes despite\nbeing exclusively trained on one malware subtype, namely the Transponder from\nthe Spyware family. 
This system was built after training 15 distinct random\nforest-based models, each on a different malware subtype from the\nCIC-MalMem-2022 dataset. These models were evaluated against the entire range\nof malware subtypes, including all unseen malware subtypes. To maintain the\nsystem's streamlined nature, training was confined to the top five most\nimportant features, which also enhanced interpretability. The\nTransponder-focused model exhibited high accuracy, exceeding 99.8%, with an\naverage processing speed of 5.7 microseconds per file. We also illustrate how\nthe Shapley additive explanations technique can facilitate the interpretation\nof the model predictions. Our research contributes to advancing malware\ndetection methodologies, pioneering the feasibility of detecting obfuscated\nmalware by exclusively training a model on a single or a few carefully selected\nmalware subtypes and applying it to detect unseen subtypes.\n","authors":["Oladipo A. Madamidola","Felix Ngobigha","Adnane Ez-zizi"],"pdf_url":"https://arxiv.org/pdf/2407.07918v2.pdf","comment":"30 pages (excluding Appendix), 5 figures and 5 tables. Now published\n in Intelligent Systems with Applications\n (https://doi.org/10.1016/j.iswa.2024.200472)"},{"id":"http://arxiv.org/abs/2503.04386v1","updated":"2025-03-06T12:37:55Z","published":"2025-03-06T12:37:55Z","title":"Time-varying Factor Augmented Vector Autoregression with Grouped Sparse\n Autoencoder","summary":" Recent economic events, including the global financial crisis and COVID-19\npandemic, have exposed limitations in linear Factor Augmented Vector\nAutoregressive (FAVAR) models for forecasting and structural analysis.\nNonlinear dimension techniques, particularly autoencoders, have emerged as\npromising alternatives in a FAVAR framework, but challenges remain in\nidentifiability, interpretability, and integration with traditional nonlinear\ntime series methods. 
We address these challenges through two contributions.\nFirst, we introduce a Grouped Sparse autoencoder that employs the\nSpike-and-Slab Lasso prior, with parameters under this prior being shared\nacross variables of the same economic category, thereby achieving\nsemi-identifiability and enhancing model interpretability. Second, we\nincorporate time-varying parameters into the VAR component to better capture\nevolving economic dynamics. Our empirical application to the US economy\ndemonstrates that the Grouped Sparse autoencoder produces more interpretable\nfactors through its parsimonious structure; and its combination with\ntime-varying parameter VAR shows superior performance in both point and density\nforecasting. Impulse response analysis reveals that monetary policy shocks\nduring recessions generate more moderate responses with higher uncertainty\ncompared to expansionary periods.\n","authors":["Yiyong Luo","Brooks Paige","Jim Griffin"],"pdf_url":"https://arxiv.org/pdf/2503.04386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.07527v2","updated":"2025-03-06T12:34:23Z","published":"2025-02-11T13:08:03Z","title":"Nature Language Model: Deciphering the Language of Nature for Scientific\n Discovery","summary":" Foundation models have revolutionized natural language processing and\nartificial intelligence, significantly enhancing how machines comprehend and\ngenerate human languages. Inspired by the success of these foundation models,\nresearchers have developed foundation models for individual scientific domains,\nincluding small molecules, materials, proteins, DNA, RNA and even cells.\nHowever, these models are typically trained in isolation, lacking the ability\nto integrate across different scientific domains. 
Recognizing that entities\nwithin these domains can all be represented as sequences, which together form\nthe \"language of nature\", we introduce Nature Language Model (NatureLM), a\nsequence-based science foundation model designed for scientific discovery.\nPre-trained with data from multiple scientific domains, NatureLM offers a\nunified, versatile model that enables various applications including: (i)\ngenerating and optimizing small molecules, proteins, RNA, and materials using\ntext instructions; (ii) cross-domain generation/design, such as\nprotein-to-molecule and protein-to-RNA generation; and (iii) top performance\nacross different domains, matching or surpassing state-of-the-art specialist\nmodels. NatureLM offers a promising generalist approach for various scientific\ntasks, including drug discovery (hit generation/optimization, ADMET\noptimization, synthesis), novel material design, and the development of\ntherapeutic proteins or nucleotides. We have developed NatureLM models in\ndifferent sizes (1 billion, 8 billion, and 46.7 billion parameters) and\nobserved a clear improvement in performance as the model size increases.\n","authors":["Yingce Xia","Peiran Jin","Shufang Xie","Liang He","Chuan Cao","Renqian Luo","Guoqing Liu","Yue Wang","Zequn Liu","Yuan-Jyue Chen","Zekun Guo","Yeqi Bai","Pan Deng","Yaosen Min","Ziheng Lu","Hongxia Hao","Han Yang","Jielan Li","Chang Liu","Jia Zhang","Jianwei Zhu","Ran Bi","Kehan Wu","Wei Zhang","Kaiyuan Gao","Qizhi Pei","Qian Wang","Xixian Liu","Yanting Li","Houtian Zhu","Yeqing Lu","Mingqian Ma","Zun Wang","Tian Xie","Krzysztof Maziarz","Marwin Segler","Zhao Yang","Zilong Chen","Yu Shi","Shuxin Zheng","Lijun Wu","Chen Hu","Peggy Dai","Tie-Yan Liu","Haiguang Liu","Tao Qin"],"pdf_url":"https://arxiv.org/pdf/2502.07527v2.pdf","comment":"93 pages"},{"id":"http://arxiv.org/abs/2503.04378v1","updated":"2025-03-06T12:30:24Z","published":"2025-03-06T12:30:24Z","title":"Dedicated Feedback and Edit Models Empower Inference-Time 
Scaling for\n Open-Ended General-Domain Tasks","summary":" Inference-Time Scaling has been critical to the success of recent models such\nas OpenAI o1 and DeepSeek R1. However, many techniques used to train models for\ninference-time scaling require tasks to have answers that can be verified,\nlimiting their application to domains such as math, coding and logical\nreasoning. We take inspiration from how humans make first attempts, ask for\ndetailed feedback from others and make improvements based on such feedback\nacross a wide spectrum of open-ended endeavors. To this end, we collect data\nfor and train dedicated Feedback and Edit Models that are capable of performing\ninference-time scaling for open-ended general-domain tasks. In our setup, one\nmodel generates an initial response, which are given feedback by a second\nmodel, that are then used by a third model to edit the response. We show that\nperformance on Arena Hard, a benchmark strongly predictive of Chatbot Arena Elo\ncan be boosted by scaling the number of initial response drafts, effective\nfeedback and edited responses. When scaled optimally, our setup based on 70B\nmodels from the Llama 3 family can reach SoTA performance on Arena Hard at 92.7\nas of 5 Mar 2025, surpassing OpenAI o1-preview-2024-09-12 with 90.4 and\nDeepSeek R1 with 92.3.\n","authors":["Zhilin Wang","Jiaqi Zeng","Olivier Delalleau","Daniel Egert","Ellie Evans","Hoo-Chang Shin","Felipe Soares","Yi Dong","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2503.04378v1.pdf","comment":"22 pages, 2 figures"},{"id":"http://arxiv.org/abs/2503.04377v1","updated":"2025-03-06T12:28:59Z","published":"2025-03-06T12:28:59Z","title":"How can representation dimension dominate structurally pruned LLMs?","summary":" Pruning assumes a subnetwork exists in the original deep neural network,\nwhich can achieve comparative model performance with less computation than the\noriginal. 
However, it is unclear how the model performance varies with the\ndifferent subnetwork extractions. In this paper, we choose the representation\ndimension (or embedding dimension, model dimension, the dimension of the\nresidual stream in the relevant literature) as the entry point to this issue.\nWe investigate the linear transformations in the LLM transformer blocks and\nconsider a specific structured pruning approach, SliceGPT, to extract the\nsubnetworks of different representation dimensions. We mechanistically analyse\nthe activation flow during the model forward passes, and find the\nrepresentation dimension dominates the linear transformations, model\npredictions, and, finally, the model performance. Explicit analytical relations\nare given to calculate the pruned model performance (perplexity and accuracy)\nwithout actual evaluation, and are empirically validated with\nLlama-3-8B-Instruct and Phi-3-mini-4k-Instruct.\n","authors":["Mingxue Xu","Lisa Alazraki","Danilo P. Mandic"],"pdf_url":"https://arxiv.org/pdf/2503.04377v1.pdf","comment":"ICLR 2025 Workshop on Sparsity in LLMs (SLLM)"},{"id":"http://arxiv.org/abs/2502.16532v2","updated":"2025-03-06T12:19:59Z","published":"2025-02-23T10:48:11Z","title":"Deep unrolling for learning optimal spatially varying regularisation\n parameters for Total Generalised Variation","summary":" We extend a recently introduced deep unrolling framework for learning\nspatially varying regularisation parameters in inverse imaging problems to the\ncase of Total Generalised Variation (TGV). The framework combines a deep\nconvolutional neural network (CNN) inferring the two spatially varying TGV\nparameters with an unrolled algorithmic scheme that solves the corresponding\nvariational problem. 
The two subnetworks are jointly trained end-to-end in a\nsupervised fashion and as such the CNN learns to compute those parameters that\ndrive the reconstructed images as close to the ground truth as possible.\nNumerical results in image denoising and MRI reconstruction show a significant\nqualitative and quantitative improvement compared to the best TGV scalar\nparameter case as well as to other approaches employing spatially varying\nparameters computed by unsupervised methods. We also observe that the inferred\nspatially varying parameter maps have a consistent structure near the image\nedges, asking for further theoretical investigations. In particular, the\nparameter that weighs the first-order TGV term has a triple-edge structure with\nalternating high-low-high values whereas the one that weighs the second-order\nterm attains small values in a large neighbourhood around the edges.\n","authors":["Thanh Trung Vu","Andreas Kofler","Kostas Papafitsoros"],"pdf_url":"https://arxiv.org/pdf/2502.16532v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04370v1","updated":"2025-03-06T12:15:56Z","published":"2025-03-06T12:15:56Z","title":"FILM: Framework for Imbalanced Learning Machines based on a new unbiased\n performance measure and a new ensemble-based technique","summary":" This research addresses the challenges of handling unbalanced datasets for\nbinary classification tasks. In such scenarios, standard evaluation metrics are\noften biased by the disproportionate representation of the minority class.\nConducting experiments across seven datasets, we uncovered inconsistencies in\nevaluation metrics when determining the model that outperforms others for each\nbinary classification problem. This justifies the need for a metric that\nprovides a more consistent and unbiased evaluation across unbalanced datasets,\nthereby supporting robust model selection. 
To mitigate this problem, we propose\na novel metric, the Unbiased Integration Coefficients (UIC), which exhibits\nsignificantly reduced bias ($p < 10^{-4}$) towards the minority class compared\nto conventional metrics. The UIC is constructed by aggregating existing metrics\nwhile penalising those more prone to imbalance. In addition, we introduce the\nIdentical Partitions for Imbalance Problems (IPIP) algorithm for imbalanced ML\nproblems, an ensemble-based approach. Our experimental results show that IPIP\noutperforms other baseline imbalance-aware approaches using Random Forest and\nLogistic Regression models in three out of seven datasets as assessed by the\nUIC metric, demonstrating its effectiveness in addressing imbalanced data\nchallenges in binary classification tasks. This new framework for dealing with\nimbalanced datasets is materialized in the FILM (Framework for Imbalanced\nLearning Machines) R Package, accessible at https://github.com/antoniogt/FILM.\n","authors":["Antonio Guillén-Teruel","Marcos Caracena","Jose A. Pardo","Fernando de-la-Gándara","José Palma","Juan A. Botía"],"pdf_url":"https://arxiv.org/pdf/2503.04370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01257v2","updated":"2025-03-06T12:13:14Z","published":"2024-10-02T06:05:52Z","title":"HelpSteer2-Preference: Complementing Ratings with Preferences","summary":" Reward models are critical for aligning models to follow instructions, and\nare typically trained following one of two popular paradigms: Bradley-Terry\nstyle or Regression style. However, there is a lack of evidence that either\napproach is better than the other, when adequately matched for data. This is\nprimarily because these approaches require data collected in different (but\nincompatible) formats, meaning that adequately matched data is not available in\nexisting public datasets. 
To tackle this problem, we release preference\nannotations (designed for Bradley-Terry training) to complement existing\nratings (designed for Regression style training) in the HelpSteer2 dataset. To\nimprove data interpretability, preference annotations are accompanied with\nhuman-written justifications. Using this data, we conduct the first\nhead-to-head comparison of Bradley-Terry and Regression models when adequately\nmatched for data. Based on insights derived from such a comparison, we propose\na novel approach to combine Bradley-Terry and Regression reward modeling. A\nLlama-3.1-70B-Instruct model tuned with this approach scores 94.1 on\nRewardBench, emerging top of more than 140 reward models as of 1 Oct 2024. This\nreward model can then be used with REINFORCE algorithm (RLHF) to align an\nInstruct model to reach 85.0 on Arena Hard, which is No. 1 as of 1 Oct 2024. We\nopen-source this dataset (CC-BY-4.0 license) at\nhttps://huggingface.co/datasets/nvidia/HelpSteer2#preferences-new -- 1-oct-2024\nand openly release the trained Reward and Instruct models at\nhttps://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Reward and\nhttps://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct\n","authors":["Zhilin Wang","Alexander Bukharin","Olivier Delalleau","Daniel Egert","Gerald Shen","Jiaqi Zeng","Oleksii Kuchaiev","Yi Dong"],"pdf_url":"https://arxiv.org/pdf/2410.01257v2.pdf","comment":"Accepted to ICLR 2025; 28 pages, 3 figures"},{"id":"http://arxiv.org/abs/2503.04363v1","updated":"2025-03-06T12:06:54Z","published":"2025-03-06T12:06:54Z","title":"Causally Reliable Concept Bottleneck Models","summary":" Concept-based models are an emerging paradigm in deep learning that\nconstrains the inference process to operate through human-interpretable\nconcepts, facilitating explainability and human interaction. 
However, these\narchitectures, on par with popular opaque neural models, fail to account for\nthe true causal mechanisms underlying the target phenomena represented in the\ndata. This hampers their ability to support causal reasoning tasks, limits\nout-of-distribution generalization, and hinders the implementation of fairness\nconstraints. To overcome these issues, we propose \\emph{Causally reliable\nConcept Bottleneck Models} (C$^2$BMs), a class of concept-based architectures\nthat enforce reasoning through a bottleneck of concepts structured according to\na model of the real-world causal mechanisms. We also introduce a pipeline to\nautomatically learn this structure from observational data and\n\\emph{unstructured} background knowledge (e.g., scientific literature).\nExperimental evidence suggest that C$^2$BM are more interpretable, causally\nreliable, and improve responsiveness to interventions w.r.t. standard opaque\nand concept-based models, while maintaining their accuracy.\n","authors":["Giovanni De Felice","Arianna Casanova Flores","Francesco De Santis","Silvia Santini","Johannes Schneider","Pietro Barbiero","Alberto Termine"],"pdf_url":"https://arxiv.org/pdf/2503.04363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04362v1","updated":"2025-03-06T12:04:56Z","published":"2025-03-06T12:04:56Z","title":"A Generalist Cross-Domain Molecular Learning Framework for\n Structure-Based Drug Discovery","summary":" Structure-based drug discovery (SBDD) is a systematic scientific process that\ndevelops new drugs by leveraging the detailed physical structure of the target\nprotein. Recent advancements in pre-trained models for biomolecules have\ndemonstrated remarkable success across various biochemical applications,\nincluding drug discovery and protein engineering. 
However, in most approaches,\nthe pre-trained models primarily focus on the characteristics of either small\nmolecules or proteins, without delving into their binding interactions which\nare essential cross-domain relationships pivotal to SBDD. To fill this gap, we\npropose a general-purpose foundation model named BIT (an abbreviation for\nBiomolecular Interaction Transformer), which is capable of encoding a range of\nbiochemical entities, including small molecules, proteins, and protein-ligand\ncomplexes, as well as various data formats, encompassing both 2D and 3D\nstructures. Specifically, we introduce Mixture-of-Domain-Experts (MoDE) to\nhandle the biomolecules from diverse biochemical domains and\nMixture-of-Structure-Experts (MoSE) to capture positional dependencies in the\nmolecular structures. The proposed mixture-of-experts approach enables BIT to\nachieve both deep fusion and domain-specific encoding, effectively capturing\nfine-grained molecular interactions within protein-ligand complexes. Then, we\nperform cross-domain pre-training on the shared Transformer backbone via\nseveral unified self-supervised denoising tasks. Experimental results on\nvarious benchmarks demonstrate that BIT achieves exceptional performance in\ndownstream tasks, including binding affinity prediction, structure-based\nvirtual screening, and molecular property prediction.\n","authors":["Yiheng Zhu","Mingyang Li","Junlong Liu","Kun Fu","Jiansheng Wu","Qiuyi Li","Mingze Yin","Jieping Ye","Jian Wu","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2503.04362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04358v1","updated":"2025-03-06T12:01:41Z","published":"2025-03-06T12:01:41Z","title":"Learning Causal Response Representations through Direct Effect Analysis","summary":" We propose a novel approach for learning causal response representations. Our\nmethod aims to extract directions in which a multidimensional outcome is most\ndirectly caused by a treatment variable. 
By bridging conditional independence\ntesting with causal representation learning, we formulate an optimisation\nproblem that maximises the evidence against conditional independence between\nthe treatment and outcome, given a conditioning set. This formulation employs\nflexible regression models tailored to specific applications, creating a\nversatile framework. The problem is addressed through a generalised eigenvalue\ndecomposition. We show that, under mild assumptions, the distribution of the\nlargest eigenvalue can be bounded by a known $F$-distribution, enabling\ntestable conditional independence. We also provide theoretical guarantees for\nthe optimality of the learned representation in terms of signal-to-noise ratio\nand Fisher information maximisation. Finally, we demonstrate the empirical\neffectiveness of our approach in simulation and real-world experiments. Our\nresults underscore the utility of this framework in uncovering direct causal\neffects within complex, multivariate settings.\n","authors":["Homer Durand","Gherardo Varando","Gustau Camps-Valls"],"pdf_url":"https://arxiv.org/pdf/2503.04358v1.pdf","comment":"32 pages, 15 figures, stat.ML"},{"id":"http://arxiv.org/abs/2503.04357v1","updated":"2025-03-06T12:01:20Z","published":"2025-03-06T12:01:20Z","title":"scDD: Latent Codes Based scRNA-seq Dataset Distillation with Foundation\n Model Knowledge","summary":" Single-cell RNA sequencing (scRNA-seq) technology has profiled hundreds of\nmillions of human cells across organs, diseases, development and perturbations\nto date. However, the high-dimensional sparsity, batch effect noise, category\nimbalance, and ever-increasing data scale of the original sequencing data pose\nsignificant challenges for multi-center knowledge transfer, data fusion, and\ncross-validation between scRNA-seq datasets. 
To address these barriers, (1) we\nfirst propose a latent codes-based scRNA-seq dataset distillation framework\nnamed scDD, which transfers and distills foundation model knowledge and\noriginal dataset information into a compact latent space and generates\nsynthetic scRNA-seq dataset by a generator to replace the original dataset.\nThen, (2) we propose a single-step conditional diffusion generator named SCDG,\nwhich perform single-step gradient back-propagation to help scDD optimize\ndistillation quality and avoid gradient decay caused by multi-step\nback-propagation. Meanwhile, SCDG ensures the scRNA-seq data characteristics\nand inter-class discriminability of the synthetic dataset through flexible\nconditional control and generation quality assurance. Finally, we propose a\ncomprehensive benchmark to evaluate the performance of scRNA-seq dataset\ndistillation in different data analysis tasks. It is validated that our\nproposed method can achieve 7.61% absolute and 15.70% relative improvement over\nprevious state-of-the-art methods on average task.\n","authors":["Zhen Yu","Jianan Han","Yang Liu","Qingchao Chen"],"pdf_url":"https://arxiv.org/pdf/2503.04357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04350v1","updated":"2025-03-06T11:46:07Z","published":"2025-03-06T11:46:07Z","title":"EDCA -- An Evolutionary Data-Centric AutoML Framework for Efficient\n Pipelines","summary":" Automated Machine Learning (AutoML) gained popularity due to the increased\ndemand for Machine Learning (ML) specialists, allowing them to apply ML\ntechniques effortlessly and quickly. AutoML implementations use optimisation\nmethods to identify the most effective ML solution for a given dataset, aiming\nto improve one or more predefined metrics. However, most implementations focus\non model selection and hyperparameter tuning. 
Despite being an important factor\nin obtaining high-performance ML systems, data quality is usually an overlooked\npart of AutoML and continues to be a manual and time-consuming task. This work\npresents EDCA, an Evolutionary Data Centric AutoML framework. In addition to\nthe traditional tasks such as selecting the best models and hyperparameters,\nEDCA enhances the given data by optimising data processing tasks such as data\nreduction and cleaning according to the problems' needs. All these steps create\nan ML pipeline that is optimised by an evolutionary algorithm. To assess its\neffectiveness, EDCA was compared to FLAML and TPOT, two frameworks at the top\nof the AutoML benchmarks. The frameworks were evaluated in the same conditions\nusing datasets from AMLB classification benchmarks. EDCA achieved statistically\nsimilar results in performance to FLAML and TPOT but used significantly less\ndata to train the final solutions. Moreover, EDCA experimental results reveal\nthat a good performance can be achieved using less data and efficient ML\nalgorithm aspects that align with Green AutoML guidelines\n","authors":["Joana Simões","João Correia"],"pdf_url":"https://arxiv.org/pdf/2503.04350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04347v1","updated":"2025-03-06T11:43:30Z","published":"2025-03-06T11:43:30Z","title":"Large Language Models for Zero-shot Inference of Causal Structures in\n Biology","summary":" Genes, proteins and other biological entities influence one another via\ncausal molecular networks. Causal relationships in such networks are mediated\nby complex and diverse mechanisms, through latent variables, and are often\nspecific to cellular context. It remains challenging to characterise such\nnetworks in practice. Here, we present a novel framework to evaluate large\nlanguage models (LLMs) for zero-shot inference of causal relationships in\nbiology. 
In particular, we systematically evaluate causal claims obtained from\nan LLM using real-world interventional data. This is done over one hundred\nvariables and thousands of causal hypotheses. Furthermore, we consider several\nprompting and retrieval-augmentation strategies, including large, and\npotentially conflicting, collections of scientific articles. Our results show\nthat with tailored augmentation and prompting, even relatively small LLMs can\ncapture meaningful aspects of causal structure in biological systems. This\nsupports the notion that LLMs could act as orchestration tools in biological\ndiscovery, by helping to distil current knowledge in ways amenable to\ndownstream analysis. Our approach to assessing LLMs with respect to\nexperimental data is relevant for a broad range of problems at the intersection\nof causal learning, LLMs and scientific discovery.\n","authors":["Izzy Newsham","Luka Kovačević","Richard Moulange","Nan Rosemary Ke","Sach Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2503.04347v1.pdf","comment":"ICLR 2025 Workshop on Machine Learning for Genomics Explorations"},{"id":"http://arxiv.org/abs/2503.04342v1","updated":"2025-03-06T11:39:07Z","published":"2025-03-06T11:39:07Z","title":"TRANSIT your events into a new mass: Fast background interpolation for\n weakly-supervised anomaly searches","summary":" We introduce a new model for conditional and continuous data morphing called\nTRansport Adversarial Network for Smooth InTerpolation (TRANSIT). We apply it\nto create a background data template for weakly-supervised searches at the LHC.\nThe method smoothly transforms sideband events to match signal region mass\ndistributions. We demonstrate the performance of TRANSIT using the LHC Olympics\nR\\&D dataset. The model captures non-linear mass correlations of features and\nproduces a template that offers a competitive anomaly sensitivity compared to\nstate-of-the-art transport-based template generators. 
Moreover, the\ncomputational training time required for TRANSIT is an order of magnitude lower\nthan that of competing deep learning methods. This makes it ideal for analyses\nthat iterate over many signal regions and signal models. Unlike generative\nmodels, which must learn a full probability density distribution, i.e., the\ncorrelations between all the variables, the proposed transport model only has\nto learn a smooth conditional shift of the distribution. This allows for a\nsimpler, more efficient residual architecture, enabling mass uncorrelated\nfeatures to pass the network unchanged while the mass correlated features are\nadjusted accordingly. Furthermore, we show that the latent space of the model\nprovides a set of mass decorrelated features useful for anomaly detection\nwithout background sculpting.\n","authors":["Ivan Oleksiyuk","Svyatoslav Voloshynovskiy","Tobias Golling"],"pdf_url":"https://arxiv.org/pdf/2503.04342v1.pdf","comment":"34 pages, 14 figures"},{"id":"http://arxiv.org/abs/2401.13898v2","updated":"2025-03-06T11:38:00Z","published":"2024-01-25T02:25:23Z","title":"Cross-Modal Prototype based Multimodal Federated Learning under Severely\n Missing Modality","summary":" Multimodal federated learning (MFL) has emerged as a decentralized machine\nlearning paradigm, allowing multiple clients with different modalities to\ncollaborate on training a global model across diverse data sources without\nsharing their private data. However, challenges, such as data heterogeneity and\nseverely missing modalities, pose crucial hindrances to the robustness of MFL,\nsignificantly impacting the performance of global model. The occurrence of\nmissing modalities in real-world applications, such as autonomous driving,\noften arises from factors like sensor failures, leading knowledge gaps during\nthe training process. 
Specifically, the absence of a modality introduces\nmisalignment during the local training phase, stemming from zero-filling in the\ncase of clients with missing modalities. Consequently, achieving robust\ngeneralization in global model becomes imperative, especially when dealing with\nclients that have incomplete data. In this paper, we propose\n$\\textbf{Multimodal Federated Cross Prototype Learning (MFCPL)}$, a novel\napproach for MFL under severely missing modalities. Our MFCPL leverages the\ncomplete prototypes to provide diverse modality knowledge in modality-shared\nlevel with the cross-modal regularization and modality-specific level with\ncross-modal contrastive mechanism. Additionally, our approach introduces the\ncross-modal alignment to provide regularization for modality-specific features,\nthereby enhancing the overall performance, particularly in scenarios involving\nseverely missing modalities. Through extensive experiments on three multimodal\ndatasets, we demonstrate the effectiveness of MFCPL in mitigating the\nchallenges of data heterogeneity and severely missing modalities while\nimproving the overall performance and robustness of MFL.\n","authors":["Huy Q. Le","Chu Myaet Thwal","Yu Qiao","Ye Lin Tun","Minh N. H. Nguyen","Eui-Nam Huh","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.13898v2.pdf","comment":"14 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2401.12113v2","updated":"2025-03-06T11:33:28Z","published":"2024-01-22T16:51:01Z","title":"Extracting Formulae in Many-Valued Logic from Deep Neural Networks","summary":" We propose a new perspective on deep ReLU networks, namely as circuit\ncounterparts of Lukasiewicz infinite-valued logic -- a many-valued (MV)\ngeneralization of Boolean logic. An algorithm for extracting formulae in MV\nlogic from deep ReLU networks is presented. 
As the algorithm applies to\nnetworks with general, in particular also real-valued, weights, it can be used\nto extract logical formulae from deep ReLU networks trained on data.\n","authors":["Yani Zhang","Helmut Bölcskei"],"pdf_url":"https://arxiv.org/pdf/2401.12113v2.pdf","comment":"Signicant extension of the previous version"},{"id":"http://arxiv.org/abs/2503.04332v1","updated":"2025-03-06T11:30:32Z","published":"2025-03-06T11:30:32Z","title":"The Challenge of Identifying the Origin of Black-Box Large Language\n Models","summary":" The tremendous commercial potential of large language models (LLMs) has\nheightened concerns about their unauthorized use. Third parties can customize\nLLMs through fine-tuning and offer only black-box API access, effectively\nconcealing unauthorized usage and complicating external auditing processes.\nThis practice not only exacerbates unfair competition, but also violates\nlicensing agreements. In response, identifying the origin of black-box LLMs is\nan intrinsic solution to this issue. In this paper, we first reveal the\nlimitations of state-of-the-art passive and proactive identification methods\nwith experiments on 30 LLMs and two real-world black-box APIs. Then, we propose\nthe proactive technique, PlugAE, which optimizes adversarial token embeddings\nin a continuous space and proactively plugs them into the LLM for tracing and\nidentification. The experiments show that PlugAE can achieve substantial\nimprovement in identifying fine-tuned derivatives. 
We further advocate for\nlegal frameworks and regulations to better address the challenges posed by the\nunauthorized use of LLMs.\n","authors":["Ziqing Yang","Yixin Wu","Yun Shen","Wei Dai","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2503.04332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00390v2","updated":"2025-03-06T11:21:39Z","published":"2024-03-30T15:03:52Z","title":"Learning truly monotone operators with applications to nonlinear inverse\n problems","summary":" This article introduces a novel approach to learning monotone neural networks\nthrough a newly defined penalization loss. The proposed method is particularly\neffective in solving classes of variational problems, specifically monotone\ninclusion problems, commonly encountered in image processing tasks. The\nForward-Backward-Forward (FBF) algorithm is employed to address these problems,\noffering a solution even when the Lipschitz constant of the neural network is\nunknown. Notably, the FBF algorithm provides convergence guarantees under the\ncondition that the learned operator is monotone. Building on plug-and-play\nmethodologies, our objective is to apply these newly learned operators to\nsolving non-linear inverse problems. To achieve this, we initially formulate\nthe problem as a variational inclusion problem. Subsequently, we train a\nmonotone neural network to approximate an operator that may not inherently be\nmonotone. 
Leveraging the FBF algorithm, we then show simulation examples where\nthe non-linear inverse problem is successfully solved.\n","authors":["Younes Belkouchi","Jean-Christophe Pesquet","Audrey Repetti","Hugues Talbot"],"pdf_url":"https://arxiv.org/pdf/2404.00390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05714v4","updated":"2025-03-06T11:20:32Z","published":"2024-06-09T10:12:08Z","title":"A conversion theorem and minimax optimality for continuum contextual\n bandits","summary":" We study the contextual continuum bandits problem, where the learner\nsequentially receives a side information vector and has to choose an action in\na convex set, minimizing a function associated with the context. The goal is to\nminimize all the underlying functions for the received contexts, leading to the\ncontextual notion of regret, which is stronger than the standard static regret.\nAssuming that the objective functions are $\\gamma$-H\\\"older with respect to the\ncontexts, $0<\\gamma\\le 1,$ we demonstrate that any algorithm achieving a\nsub-linear static regret can be extended to achieve a sub-linear contextual\nregret. We prove a static-to-contextual regret conversion theorem that provides\nan upper bound for the contextual regret of the output algorithm as a function\nof the static regret of the input algorithm. We further study the implications\nof this general result for three fundamental cases of dependency of the\nobjective function on the action variable: (a) Lipschitz bandits, (b) convex\nbandits, (c) strongly convex and smooth bandits. For Lipschitz bandits and\n$\\gamma=1,$ combining our results with the lower bound of Slivkins (2014), we\nprove that the minimax optimal contextual regret for the noise-free adversarial\nsetting is achieved. Then, we prove that in the presence of noise, the\ncontextual regret rate as a function of the number of queries is the same for\nconvex bandits as it is for strongly convex and smooth bandits. 
Lastly, we\npresent a minimax lower bound, implying two key facts. First, obtaining a\nsub-linear contextual regret may be impossible over functions that are not\ncontinuous with respect to the context. Second, for convex bandits and strongly\nconvex and smooth bandits, the algorithms that we propose achieve, up to a\nlogarithmic factor, the minimax optimal rate of contextual regret as a function\nof the number of queries.\n","authors":["Arya Akhavan","Karim Lounici","Massimiliano Pontil","Alexandre B. Tsybakov"],"pdf_url":"https://arxiv.org/pdf/2406.05714v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.17634v2","updated":"2025-03-06T11:17:31Z","published":"2025-01-29T13:11:21Z","title":"Federated Learning With Individualized Privacy Through Client Sampling","summary":" With growing concerns about user data collection, individualized privacy has\nemerged as a promising solution to balance protection and utility by accounting\nfor diverse user privacy preferences. Instead of enforcing a uniform level of\nanonymization for all users, this approach allows individuals to choose privacy\nsettings that align with their comfort levels. Building on this idea, we\npropose an adapted method for enabling Individualized Differential Privacy\n(IDP) in Federated Learning (FL) by handling clients according to their\npersonal privacy preferences. By extending the SAMPLE algorithm from\ncentralized settings to FL, we calculate client-specific sampling rates based\non their heterogeneous privacy budgets and integrate them into a modified\nIDP-FedAvg algorithm. We test this method under realistic privacy distributions\nand multiple datasets. The experimental results demonstrate that our approach\nachieves clear improvements over uniform DP baselines, reducing the trade-off\nbetween privacy and utility. Compared to the alternative SCALE method in\nrelated work, which assigns differing noise scales to clients, our method\nperforms notably better. 
However, challenges remain for complex tasks with\nnon-i.i.d. data, primarily stemming from the constraints of the decentralized\nsetting.\n","authors":["Lucas Lange","Ole Borchardt","Erhard Rahm"],"pdf_url":"https://arxiv.org/pdf/2501.17634v2.pdf","comment":"Accepted at 10th International Conference on Machine Learning\n Technologies (ICMLT 2025)"},{"id":"http://arxiv.org/abs/2402.03448v4","updated":"2025-03-06T11:07:54Z","published":"2024-02-05T19:02:19Z","title":"Decentralized Sporadic Federated Learning: A Unified Algorithmic\n Framework with Convergence Guarantees","summary":" Decentralized federated learning (DFL) captures FL settings where both (i)\nmodel updates and (ii) model aggregations are exclusively carried out by the\nclients without a central server. Existing DFL works have mostly focused on\nsettings where clients conduct a fixed number of local updates between local\nmodel exchanges, overlooking heterogeneity and dynamics in communication and\ncomputation capabilities. In this work, we propose Decentralized Sporadic\nFederated Learning ($\\texttt{DSpodFL}$), a DFL methodology built on a\ngeneralized notion of $\\textit{sporadicity}$ in both local gradient and\naggregation processes. $\\texttt{DSpodFL}$ subsumes many existing decentralized\noptimization methods under a unified algorithmic framework by modeling the\nper-iteration (i) occurrence of gradient descent at each client and (ii)\nexchange of models between client pairs as arbitrary indicator random\nvariables, thus capturing $\\textit{heterogeneous and time-varying}$\ncomputation/communication scenarios. We analytically characterize the\nconvergence behavior of $\\texttt{DSpodFL}$ for both convex and non-convex\nmodels and for both constant and diminishing learning rates, under mild\nassumptions on the communication graph connectivity, data heterogeneity across\nclients, and gradient noises. 
We show how our bounds recover existing results\nfrom decentralized gradient descent as special cases. Experiments demonstrate\nthat $\\texttt{DSpodFL}$ consistently achieves improved training speeds compared\nwith baselines under various system settings.\n","authors":["Shahryar Zehtabi","Dong-Jun Han","Rohit Parasnis","Seyyedali Hosseinalipour","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2402.03448v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2502.15109v2","updated":"2025-03-06T11:07:48Z","published":"2025-02-21T00:05:40Z","title":"Social Genome: Grounded Social Reasoning Abilities of Multimodal Models","summary":" Social reasoning abilities are crucial for AI systems to effectively\ninterpret and respond to multimodal human communication and interaction within\nsocial contexts. We introduce Social Genome, the first benchmark for\nfine-grained, grounded social reasoning abilities of multimodal models. Social\nGenome contains 272 videos of interactions and 1,486 human-annotated reasoning\ntraces related to inferences about these interactions. These traces contain\n5,777 reasoning steps that reference evidence from visual cues, verbal cues,\nvocal cues, and external knowledge (contextual knowledge external to videos).\nSocial Genome is also the first modeling challenge to study external knowledge\nin social reasoning. 
Social Genome computes metrics to holistically evaluate\nsemantic and structural qualities of model-generated social reasoning traces.\nWe demonstrate the utility of Social Genome through experiments with\nstate-of-the-art models, identifying performance gaps and opportunities for\nfuture research to improve the grounded social reasoning abilities of\nmultimodal models.\n","authors":["Leena Mathur","Marian Qian","Paul Pu Liang","Louis-Philippe Morency"],"pdf_url":"https://arxiv.org/pdf/2502.15109v2.pdf","comment":"Under Review, 22 pages"},{"id":"http://arxiv.org/abs/2402.01879v3","updated":"2025-03-06T11:05:33Z","published":"2024-02-02T20:08:11Z","title":"$σ$-zero: Gradient-based Optimization of $\\ell_0$-norm Adversarial\n Examples","summary":" Evaluating the adversarial robustness of deep networks to gradient-based\nattacks is challenging. While most attacks consider $\\ell_2$- and\n$\\ell_\\infty$-norm constraints to craft input perturbations, only a few\ninvestigate sparse $\\ell_1$- and $\\ell_0$-norm attacks. In particular,\n$\\ell_0$-norm attacks remain the least studied due to the inherent complexity\nof optimizing over a non-convex and non-differentiable constraint. However,\nevaluating adversarial robustness under these attacks could reveal weaknesses\notherwise left untested with more conventional $\\ell_2$- and $\\ell_\\infty$-norm\nattacks. In this work, we propose a novel $\\ell_0$-norm attack, called\n$\\sigma$-zero, which leverages a differentiable approximation of the $\\ell_0$\nnorm to facilitate gradient-based optimization, and an adaptive projection\noperator to dynamically adjust the trade-off between loss minimization and\nperturbation sparsity. 
Extensive evaluations using MNIST, CIFAR10, and ImageNet\ndatasets, involving robust and non-robust models, show that\n$\\sigma$\\texttt{-zero} finds minimum $\\ell_0$-norm adversarial examples without\nrequiring any time-consuming hyperparameter tuning, and that it outperforms all\ncompeting sparse attacks in terms of success rate, perturbation size, and\nefficiency.\n","authors":["Antonio Emanuele Cinà","Francesco Villani","Maura Pintor","Lea Schönherr","Battista Biggio","Marcello Pelillo"],"pdf_url":"https://arxiv.org/pdf/2402.01879v3.pdf","comment":"Paper accepted at International Conference on Learning\n Representations (ICLR 2025). Code available at\n https://github.com/sigma0-advx/sigma-zero"},{"id":"http://arxiv.org/abs/2412.00156v3","updated":"2025-03-06T11:05:32Z","published":"2024-11-29T08:10:49Z","title":"VISION-XL: High Definition Video Inverse Problem Solver using Latent\n Image Diffusion Models","summary":" In this paper, we propose a novel framework for solving high-definition video\ninverse problems using latent image diffusion models. Building on recent\nadvancements in spatio-temporal optimization for video inverse problems using\nimage diffusion models, our approach leverages latent-space diffusion models to\nachieve enhanced video quality and resolution. To address the high\ncomputational demands of processing high-resolution frames, we introduce a\npseudo-batch consistent sampling strategy, allowing efficient operation on a\nsingle GPU. Additionally, to improve temporal consistency, we present\npseudo-batch inversion, an initialization technique that incorporates\ninformative latents from the measurement. By integrating with SDXL, our\nframework achieves state-of-the-art video reconstruction across a wide range of\nspatio-temporal inverse problems, including complex combinations of frame\naveraging and various spatial degradations, such as deblurring,\nsuper-resolution, and inpainting. 
Unlike previous methods, our approach\nsupports multiple aspect ratios (landscape, vertical, and square) and delivers\nHD-resolution reconstructions (exceeding 1280x720) in under 6 seconds per frame\non a single NVIDIA 4090 GPU.\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2412.00156v3.pdf","comment":"Project page: https://vision-xl.github.io/"},{"id":"http://arxiv.org/abs/2501.10814v2","updated":"2025-03-06T11:05:23Z","published":"2025-01-18T16:23:09Z","title":"No More Sliding Window: Efficient 3D Medical Image Segmentation with\n Differentiable Top-k Patch Sampling","summary":" 3D models surpass 2D models in CT/MRI segmentation by effectively capturing\ninter-slice relationships. However, the added depth dimension substantially\nincreases memory consumption. While patch-based training alleviates memory\nconstraints, it significantly slows down the inference speed due to the sliding\nwindow (SW) approach. We propose No-More-Sliding-Window (NMSW), a novel\nend-to-end trainable framework that enhances the efficiency of generic 3D\nsegmentation backbone during an inference step by eliminating the need for SW.\nNMSW employs a differentiable Top-k module to selectively sample only the most\nrelevant patches, thereby minimizing redundant computations. When patch-level\npredictions are insufficient, the framework intelligently leverages coarse\nglobal predictions to refine results. Evaluated across 3 tasks using 3\nsegmentation backbones, NMSW achieves competitive accuracy compared to SW\ninference while significantly reducing computational complexity by 91% (88.0 to\n8.00 TMACs). Moreover, it delivers a 9.1x faster inference on the H100 GPU\n(99.0 to 8.3 sec) and a 11.1x faster inference on the Xeon Gold CPU (2110 to\n189 sec). 
NMSW is model-agnostic, further boosting efficiency when integrated\nwith any existing efficient segmentation backbones.\n","authors":["Young Seok Jeon","Hongfei Yang","Huazhu Fu","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2501.10814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04318v1","updated":"2025-03-06T11:00:18Z","published":"2025-03-06T11:00:18Z","title":"InFL-UX: A Toolkit for Web-Based Interactive Federated Learning","summary":" This paper presents InFL-UX, an interactive, proof-of-concept browser-based\nFederated Learning (FL) toolkit designed to integrate user contributions\nseamlessly into the machine learning (ML) workflow. InFL-UX enables users\nacross multiple devices to upload datasets, define classes, and collaboratively\ntrain classification models directly in the browser using modern web\ntechnologies. Unlike traditional FL toolkits, which often focus on backend\nsimulations, InFL-UX provides a simple user interface for researchers to\nexplore how users interact with and contribute to FL systems in real-world,\ninteractive settings. 
By prioritising usability and decentralised model\ntraining, InFL-UX bridges the gap between FL and Interactive Machine Learning\n(IML), empowering non-technical users to actively participate in ML\nclassification tasks.\n","authors":["Tim Maurer","Abdulrahman Mohamed Selim","Hasan Md Tusfiqur Alam","Matthias Eiletz","Michael Barz","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2503.04318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04315v1","updated":"2025-03-06T10:58:35Z","published":"2025-03-06T10:58:35Z","title":"Provable Robust Overfitting Mitigation in Wasserstein Distributionally\n Robust Optimization","summary":" Wasserstein distributionally robust optimization (WDRO) optimizes against\nworst-case distributional shifts within a specified uncertainty set, leading to\nenhanced generalization on unseen adversarial examples, compared to standard\nadversarial training which focuses on pointwise adversarial perturbations.\nHowever, WDRO still suffers fundamentally from the robust overfitting problem,\nas it does not consider statistical error. We address this gap by proposing a\nnovel robust optimization framework under a new uncertainty set for adversarial\nnoise via Wasserstein distance and statistical error via Kullback-Leibler\ndivergence, called the Statistically Robust WDRO. We establish a robust\ngeneralization bound for the new optimization framework, implying that\nout-of-distribution adversarial performance is at least as good as the\nstatistically robust training loss with high probability. 
Furthermore, we\nderive conditions under which Stackelberg and Nash equilibria exist between the\nlearner and the adversary, giving an optimal robust model in certain sense.\nFinally, through extensive experiments, we demonstrate that our method\nsignificantly mitigates robust overfitting and enhances robustness within the\nframework of WDRO.\n","authors":["Shuang Liu","Yihan Wang","Yifan Zhu","Yibo Miao","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2503.04315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13794v2","updated":"2025-03-06T10:49:24Z","published":"2025-01-23T16:13:08Z","title":"Unveiling the Power of Noise Priors: Enhancing Diffusion Models for\n Mobile Traffic Prediction","summary":" Accurate prediction of mobile traffic, \\textit{i.e.,} network traffic from\ncellular base stations, is crucial for optimizing network performance and\nsupporting urban development. However, the non-stationary nature of mobile\ntraffic, driven by human activity and environmental changes, leads to both\nregular patterns and abrupt variations. Diffusion models excel in capturing\nsuch complex temporal dynamics due to their ability to capture the inherent\nuncertainties. Most existing approaches prioritize designing novel denoising\nnetworks but often neglect the critical role of noise itself, potentially\nleading to sub-optimal performance. In this paper, we introduce a novel\nperspective by emphasizing the role of noise in the denoising process. Our\nanalysis reveals that noise fundamentally shapes mobile traffic predictions,\nexhibiting distinct and consistent patterns. We propose NPDiff, a framework\nthat decomposes noise into \\textit{prior} and \\textit{residual} components,\nwith the \\textit{prior} derived from data dynamics, enhancing the model's\nability to capture both regular and abrupt variations. NPDiff can seamlessly\nintegrate with various diffusion-based prediction models, delivering\npredictions that are effective, efficient, and robust. 
Extensive experiments\ndemonstrate that it achieves superior performance with an improvement over\n30\\%, offering a new perspective on leveraging diffusion models in this domain.\n","authors":["Zhi Sheng","Yuan Yuan","Jingtao Ding","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2501.13794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18380v4","updated":"2025-03-06T10:25:17Z","published":"2024-06-26T14:21:21Z","title":"KAGNNs: Kolmogorov-Arnold Networks meet Graph Learning","summary":" In recent years, Graph Neural Networks (GNNs) have become the de facto tool\nfor learning node and graph representations. Most GNNs typically consist of a\nsequence of neighborhood aggregation (a.k.a., message-passing) layers, within\nwhich the representation of each node is updated based on those of its\nneighbors. The most expressive message-passing GNNs can be obtained through the\nuse of the sum aggregator and of MLPs for feature transformation, thanks to\ntheir universal approximation capabilities. However, the limitations of MLPs\nrecently motivated the introduction of another family of universal\napproximators, called Kolmogorov-Arnold Networks (KANs) which rely on a\ndifferent representation theorem. In this work, we compare the performance of\nKANs against that of MLPs on graph learning tasks. We implement three new\nKAN-based GNN layers, inspired respectively by the GCN, GAT and GIN layers. We\nevaluate two different implementations of KANs using two distinct base families\nof functions, namely B-splines and radial basis functions. We perform extensive\nexperiments on node classification, link prediction, graph classification and\ngraph regression datasets. Our results indicate that KANs are on-par with or\nbetter than MLPs on all tasks studied in this paper. We also show that the size\nand training speed of RBF-based KANs is only marginally higher than for MLPs,\nmaking them viable alternatives. 
Code available at\nhttps://github.com/RomanBresson/KAGNN.\n","authors":["Roman Bresson","Giannis Nikolentzos","George Panagopoulos","Michail Chatzianastasis","Jun Pang","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2406.18380v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16935v2","updated":"2025-03-06T10:10:06Z","published":"2024-10-22T12:12:43Z","title":"Graph Neural Networks for Edge Signals: Orientation Equivariance and\n Invariance","summary":" Many applications in traffic, civil engineering, or electrical engineering\nrevolve around edge-level signals. Such signals can be categorized as\ninherently directed, for example, the water flow in a pipe network, and\nundirected, like the diameter of a pipe. Topological methods model edge signals\nwith inherent direction by representing them relative to a so-called\norientation assigned to each edge. These approaches can neither model\nundirected edge signals nor distinguish if an edge itself is directed or\nundirected. We address these shortcomings by (i) revising the notion of\norientation equivariance to enable edge direction-aware topological models,\n(ii) proposing orientation invariance as an additional requirement to describe\nsignals without inherent direction, and (iii) developing EIGN, an architecture\ncomposed of novel direction-aware edge-level graph shift operators, that\nprovably fulfills the aforementioned desiderata. It is the first\ngeneral-purpose topological GNN for edge-level signals that can model directed\nand undirected signals while distinguishing between directed and undirected\nedges. 
A comprehensive evaluation shows that EIGN outperforms prior work in\nedge-level tasks, for example, improving in RMSE on flow simulation tasks by up\nto 23.5%.\n","authors":["Dominik Fuchsgruber","Tim Poštuvan","Stephan Günnemann","Simon Geisler"],"pdf_url":"https://arxiv.org/pdf/2410.16935v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04283v1","updated":"2025-03-06T10:09:20Z","published":"2025-03-06T10:09:20Z","title":"Explainable AI in Time-Sensitive Scenarios: Prefetched Offline\n Explanation Model","summary":" As predictive machine learning models become increasingly adopted and\nadvanced, their role has evolved from merely predicting outcomes to actively\nshaping them. This evolution has underscored the importance of Trustworthy AI,\nhighlighting the necessity to extend our focus beyond mere accuracy and toward\na comprehensive understanding of these models' behaviors within the specific\ncontexts of their applications. To further progress in explainability, we\nintroduce Poem, Prefetched Offline Explanation Model, a model-agnostic, local\nexplainability algorithm for image data. The algorithm generates exemplars,\ncounterexemplars and saliency maps to provide quick and effective explanations\nsuitable for time-sensitive scenarios. Leveraging an existing local algorithm,\n\\poem{} infers factual and counterfactual rules from data to create\nillustrative examples and opposite scenarios with an enhanced stability by\ndesign. A novel mechanism then matches incoming test points with an explanation\nbase and produces diverse exemplars, informative saliency maps and believable\ncounterexemplars. 
Experimental results indicate that Poem outperforms its\npredecessor Abele in speed and ability to generate more nuanced and varied\nexemplars alongside more insightful saliency maps and valuable\ncounterexemplars.\n","authors":["Fabio Michele Russo","Carlo Metta","Anna Monreale","Salvatore Rinzivillo","Fabio Pinelli"],"pdf_url":"https://arxiv.org/pdf/2503.04283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04280v1","updated":"2025-03-06T10:08:44Z","published":"2025-03-06T10:08:44Z","title":"Towards Autonomous Reinforcement Learning for Real-World Robotic\n Manipulation with Large Language Models","summary":" Recent advancements in Large Language Models (LLMs) and Visual Language\nModels (VLMs) have significantly impacted robotics, enabling high-level\nsemantic motion planning applications. Reinforcement Learning (RL), a\ncomplementary paradigm, enables agents to autonomously optimize complex\nbehaviors through interaction and reward signals. However, designing effective\nreward functions for RL remains challenging, especially in real-world tasks\nwhere sparse rewards are insufficient and dense rewards require elaborate\ndesign. In this work, we propose Autonomous Reinforcement learning for Complex\nHumanInformed Environments (ARCHIE), an unsupervised pipeline leveraging GPT-4,\na pre-trained LLM, to generate reward functions directly from natural language\ntask descriptions. The rewards are used to train RL agents in simulated\nenvironments, where we formalize the reward generation process to enhance\nfeasibility. Additionally, GPT-4 automates the coding of task success criteria,\ncreating a fully automated, one-shot procedure for translating human-readable\ntext into deployable robot skills. 
Our approach is validated through extensive\nsimulated experiments on single-arm and bi-manual manipulation tasks using an\nABB YuMi collaborative robot, highlighting its practicality and effectiveness.\nTasks are demonstrated on the real robot setup.\n","authors":["Niccolò Turcato","Matteo Iovino","Aris Synodinos","Alberto Dalla Libera","Ruggero Carli","Pietro Falco"],"pdf_url":"https://arxiv.org/pdf/2503.04280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04278v1","updated":"2025-03-06T10:07:17Z","published":"2025-03-06T10:07:17Z","title":"A General Framework for Scalable UE-AP Association in User-Centric\n Cell-Free Massive MIMO based on Recurrent Neural Networks","summary":" This study addresses the challenge of access point (AP) and user equipment\n(UE) association in cell-free massive MIMO networks. It introduces a deep\nlearning algorithm leveraging Bidirectional Long Short-Term Memory cells and a\nhybrid probabilistic methodology for weight updating. This approach enhances\nscalability by adapting to variations in the number of UEs without requiring\nretraining. Additionally, the study presents a training methodology that\nimproves scalability not only with respect to the number of UEs but also to the\nnumber of APs. Furthermore, a variant of the proposed AP-UE algorithm ensures\nrobustness against pilot contamination effects, a critical issue arising from\npilot reuse in channel estimation. Extensive numerical results validate the\neffectiveness and adaptability of the proposed methods, demonstrating their\nsuperiority over widely used heuristic alternatives.\n","authors":["Giovanni Di Gennaro","Amedeo Buonanno","Gianmarco Romano","Stefano Buzzi","Francesco A. N. 
Palmieri"],"pdf_url":"https://arxiv.org/pdf/2503.04278v1.pdf","comment":"submitted to IEEE journal"},{"id":"http://arxiv.org/abs/2405.14736v2","updated":"2025-03-06T09:52:43Z","published":"2024-05-23T16:02:30Z","title":"GIFT: Unlocking Full Potential of Labels in Distilled Dataset at\n Near-zero Cost","summary":" Recent advancements in dataset distillation have demonstrated the significant\nbenefits of employing soft labels generated by pre-trained teacher models. In\nthis paper, we introduce a novel perspective by emphasizing the full\nutilization of labels. We first conduct a comprehensive comparison of various\nloss functions for soft label utilization in dataset distillation, revealing\nthat the model trained on the synthetic dataset exhibits high sensitivity to\nthe choice of loss function for soft label utilization. This finding highlights\nthe necessity of a universal loss function for training models on synthetic\ndatasets. Building on these insights, we introduce an extremely simple yet\nsurprisingly effective plug-and-play approach, GIFT, which encompasses soft\nlabel refinement and a cosine similarity-based loss function to efficiently\nleverage full label information. Extensive experiments indicate that GIFT\nconsistently enhances state-of-the-art dataset distillation methods across\nvarious dataset scales, without incurring additional computational costs.\nImportantly, GIFT significantly enhances cross-optimizer generalization, an\narea previously overlooked. For instance, on ImageNet-1K with IPC = 10, GIFT\nenhances the state-of-the-art method RDED by 30.8% in cross-optimizer\ngeneralization. 
Our code is available at https://github.com/LINs-lab/GIFT.\n","authors":["Xinyi Shang","Peng Sun","Tao Lin"],"pdf_url":"https://arxiv.org/pdf/2405.14736v2.pdf","comment":"https://github.com/LINs-lab/GIFT"},{"id":"http://arxiv.org/abs/2503.04266v1","updated":"2025-03-06T09:50:43Z","published":"2025-03-06T09:50:43Z","title":"Frequency Hopping Synchronization by Reinforcement Learning for\n Satellite Communication System","summary":" Satellite communication systems (SCSs) used for tactical purposes require\nrobust security and anti-jamming capabilities, making frequency hopping (FH) a\npowerful option. However, the current FH systems face challenges due to\nsignificant interference from other devices and the considerable path loss\ninherent in satellite communication. This misalignment leads to inefficient\nsynchronization, crucial for maintaining reliable communication. Traditional\nmethods, such as those employing long short-term memory (LSTM) networks, have\nmade improvements, but they still struggle in dynamic conditions of satellite\nenvironments. This paper presents a novel method for synchronizing FH signals\nin tactical SCSs by combining serial search and reinforcement learning to\nachieve coarse and fine acquisition, respectively. The mathematical analysis\nand simulation results demonstrate that the proposed method reduces the average\nnumber of hops required for synchronization by 58.17% and mean squared error\n(MSE) of the uplink hop timing estimation by 76.95%, as compared to the\nconventional serial search method. 
Comparing with the early late gate\nsynchronization method based on serial search and use of LSTM network, the\naverage number of hops for synchronization is reduced by 12.24% and the MSE by\n18.5%.\n","authors":["Inkyu Kim","Sangkeum Lee","Haechan Jeong","Sarvar Hussain Nengroo","Dongsoo Har"],"pdf_url":"https://arxiv.org/pdf/2503.04266v1.pdf","comment":"18pages, 5figures"},{"id":"http://arxiv.org/abs/2503.04263v1","updated":"2025-03-06T09:47:41Z","published":"2025-03-06T09:47:41Z","title":"Bi-Lipschitz Ansatz for Anti-Symmetric Functions","summary":" Motivated by applications for simulating quantum many body functions, we\npropose a new universal ansatz for approximating anti-symmetric functions. The\nmain advantage of this ansatz over previous alternatives is that it is\nbi-Lipschitz with respect to a naturally defined metric. As a result, we are\nable to obtain quantitative approximation results for approximation of\nLipschitz continuous antisymmetric functions. Moreover, we provide preliminary\nexperimental evidence to the improved performance of this ansatz for learning\nantisymmetric functions.\n","authors":["Nadav Dym","Jianfeng Lu","Matan Mizrachi"],"pdf_url":"https://arxiv.org/pdf/2503.04263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04256v1","updated":"2025-03-06T09:38:14Z","published":"2025-03-06T09:38:14Z","title":"Knowledge Retention for Continual Model-Based Reinforcement Learning","summary":" We propose DRAGO, a novel approach for continual model-based reinforcement\nlearning aimed at improving the incremental development of world models across\na sequence of tasks that differ in their reward functions but not the state\nspace or dynamics. 
DRAGO comprises two key components: Synthetic Experience\nRehearsal, which leverages generative models to create synthetic experiences\nfrom past tasks, allowing the agent to reinforce previously learned dynamics\nwithout storing data, and Regaining Memories Through Exploration, which\nintroduces an intrinsic reward mechanism to guide the agent toward revisiting\nrelevant states from prior tasks. Together, these components enable the agent\nto maintain a comprehensive and continually developing world model,\nfacilitating more effective learning and adaptation across diverse\nenvironments. Empirical evaluations demonstrate that DRAGO is able to preserve\nknowledge across tasks, achieving superior performance in various continual\nlearning scenarios.\n","authors":["Yixiang Sun","Haotian Fu","Michael Littman","George Konidaris"],"pdf_url":"https://arxiv.org/pdf/2503.04256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04252v1","updated":"2025-03-06T09:35:20Z","published":"2025-03-06T09:35:20Z","title":"RCRank: Multimodal Ranking of Root Causes of Slow Queries in Cloud\n Database Systems","summary":" With the continued migration of storage to cloud database systems,the impact\nof slow queries in such systems on services and user experience is increasing.\nRoot-cause diagnosis plays an indispensable role in facilitating slow-query\ndetection and revision. This paper proposes a method capable of both\nidentifying possible root cause types for slow queries and ranking these\naccording to their potential for accelerating slow queries. This enables\nprioritizing root causes with the highest impact, in turn improving slow-query\nrevision effectiveness. 
To enable more accurate and detailed diagnoses, we\npropose the multimodal Ranking for the Root Causes of slow queries (RCRank)\nframework, which formulates root cause analysis as a multimodal machine\nlearning problem and leverages multimodal information from query statements,\nexecution plans, execution logs, and key performance indicators. To obtain\nexpressive embeddings from its heterogeneous multimodal input, RCRank\nintegrates self-supervised pre-training that enhances cross-modal alignment and\ntask relevance. Next, the framework integrates root-cause-adaptive cross\nTransformers that enable adaptive fusion of multimodal features with varying\ncharacteristics. Finally, the framework offers a unified model that features an\nimpact-aware training objective for identifying and ranking root causes. We\nreport on experiments on real and synthetic datasets, finding that RCRank is\ncapable of consistently outperforming the state-of-the-art methods at root\ncause identification and ranking according to a range of metrics.\n","authors":["Biao Ouyang","Yingying Zhang","Hanyin Cheng","Yang Shu","Chenjuan Guo","Bin Yang","Qingsong Wen","Lunting Fan","Christian S. Jensen"],"pdf_url":"https://arxiv.org/pdf/2503.04252v1.pdf","comment":"Accepted by VLDB 2025"},{"id":"http://arxiv.org/abs/2403.15038v2","updated":"2025-03-06T09:32:52Z","published":"2024-03-22T08:42:41Z","title":"Estimation of multiple mean vectors in high dimension","summary":" We endeavour to estimate numerous multi-dimensional means of various\nprobability distributions on a common space based on independent samples. Our\napproach involves forming estimators through convex combinations of empirical\nmeans derived from these samples. 
We introduce two strategies to find\nappropriate data-dependent convex combination weights: a first one employing a\ntesting procedure to identify neighbouring means with low variance, which\nresults in a closed-form plug-in formula for the weights, and a second one\ndetermining weights via minimization of an upper confidence bound on the\nquadratic risk.Through theoretical analysis, we evaluate the improvement in\nquadratic risk offered by our methods compared to the empirical means. Our\nanalysis focuses on a dimensional asymptotics perspective, showing that our\nmethods asymptotically approach an oracle (minimax) improvement as the\neffective dimension of the data increases.We demonstrate the efficacy of our\nmethods in estimating multiple kernel mean embeddings through experiments on\nboth simulated and real-world datasets.\n","authors":["Gilles Blanchard","Jean-Baptiste Fermanian","Hannah Marienwald"],"pdf_url":"https://arxiv.org/pdf/2403.15038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04249v1","updated":"2025-03-06T09:32:39Z","published":"2025-03-06T09:32:39Z","title":"How to Mitigate Overfitting in Weak-to-strong Generalization?","summary":" Aligning powerful AI models on tasks that surpass human evaluation\ncapabilities is the central problem of \\textbf{superalignment}. To address this\nproblem, weak-to-strong generalization aims to elicit the capabilities of\nstrong models through weak supervisors and ensure that the behavior of strong\nmodels aligns with the intentions of weak supervisors without unsafe behaviors\nsuch as deception. Although weak-to-strong generalization exhibiting certain\ngeneralization capabilities, strong models exhibit significant overfitting in\nweak-to-strong generalization: Due to the strong fit ability of strong models,\nerroneous labels from weak supervisors may lead to overfitting in strong\nmodels. 
In addition, simply filtering out incorrect labels may lead to a\ndegeneration in question quality, resulting in a weak generalization ability of\nstrong models on hard questions. To mitigate overfitting in weak-to-strong\ngeneralization, we propose a two-stage framework that simultaneously improves\nthe quality of supervision signals and the quality of input questions.\nExperimental results in three series of large language models and two\nmathematical benchmarks demonstrate that our framework significantly improves\nPGR compared to naive weak-to-strong generalization, even achieving up to 100\\%\nPGR on some models.\n","authors":["Junhao Shi","Qinyuan Cheng","Zhaoye Fei","Yining Zheng","Qipeng Guo","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2503.04249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19085v2","updated":"2025-03-06T09:32:37Z","published":"2024-12-26T06:54:22Z","title":"Assessing Pre-Trained Models for Transfer Learning Through Distribution\n of Spectral Components","summary":" Pre-trained model assessment for transfer learning aims to identify the\noptimal candidate for the downstream tasks from a model hub, without the need\nof time-consuming fine-tuning. Existing advanced works mainly focus on\nanalyzing the intrinsic characteristics of the entire features extracted by\neach pre-trained model or how well such features fit the target labels. This\npaper proposes a novel perspective for pre-trained model assessment through the\nDistribution of Spectral Components (DISCO). Through singular value\ndecomposition of features extracted from pre-trained models, we investigate\ndifferent spectral components and observe that they possess distinct\ntransferability, contributing diversely to the fine-tuning performance.\nInspired by this, we propose an assessment method based on the distribution of\nspectral components which measures the proportions of their corresponding\nsingular values. 
Pre-trained models with features concentrating on more\ntransferable components are regarded as better choices for transfer learning.\nWe further leverage the labels of downstream data to better estimate the\ntransferability of each spectral component and derive the final assessment\ncriterion. Our proposed method is flexible and can be applied to both\nclassification and regression tasks. We conducted comprehensive experiments\nacross three benchmarks and two tasks including image classification and object\ndetection, demonstrating that our method achieves state-of-the-art performance\nin choosing proper pre-trained models from the model hub for transfer learning.\n","authors":["Tengxue Zhang","Yang Shu","Xinyang Chen","Yifei Long","Chenjuan Guo","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2412.19085v2.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2503.04242v1","updated":"2025-03-06T09:24:23Z","published":"2025-03-06T09:24:23Z","title":"Incorporating Surrogate Gradient Norm to Improve Offline Optimization\n Techniques","summary":" Offline optimization has recently emerged as an increasingly popular approach\nto mitigate the prohibitively expensive cost of online experimentation. The key\nidea is to learn a surrogate of the black-box function that underlines the\ntarget experiment using a static (offline) dataset of its previous input-output\nqueries. Such an approach is, however, fraught with an out-of-distribution\nissue where the learned surrogate becomes inaccurate outside the offline data\nregimes. To mitigate this, existing offline optimizers have proposed numerous\nconditioning techniques to prevent the learned surrogate from being too\nerratic. Nonetheless, such conditioning strategies are often specific to\nparticular surrogate or search models, which might not generalize to a\ndifferent model choice. 
This motivates us to develop a model-agnostic approach\ninstead, which incorporates a notion of model sharpness into the training loss\nof the surrogate as a regularizer. Our approach is supported by a new\ntheoretical analysis demonstrating that reducing surrogate sharpness on the\noffline dataset provably reduces its generalized sharpness on unseen data. Our\nanalysis extends existing theories from bounding generalized prediction loss\n(on unseen data) with loss sharpness to bounding the worst-case generalized\nsurrogate sharpness with its empirical estimate on training data, providing a\nnew perspective on sharpness regularization. Our extensive experimentation on a\ndiverse range of optimization tasks also shows that reducing surrogate\nsharpness often leads to significant improvement, marking (up to) a noticeable\n9.6% performance boost. Our code is publicly available at\nhttps://github.com/cuong-dm/IGNITE\n","authors":["Manh Cuong Dao","Phi Le Nguyen","Thao Nguyen Truong","Trong Nghia Hoang"],"pdf_url":"https://arxiv.org/pdf/2503.04242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.19102v2","updated":"2025-03-06T09:23:22Z","published":"2025-01-31T12:51:55Z","title":"Reinforcement Learning on Reconfigurable Hardware: Overcoming Material\n Variability in Laser Material Processing","summary":" Ensuring consistent processing quality is challenging in laser processes due\nto varying material properties and surface conditions. Although some approaches\nhave shown promise in solving this problem via automation, they often rely on\npredetermined targets or are limited to simulated environments. To address\nthese shortcomings, we propose a novel real-time reinforcement learning\napproach for laser process control, implemented on a Field Programmable Gate\nArray to achieve real-time execution. 
Our experimental results from laser\nwelding tests on stainless steel samples with a range of surface roughnesses\nvalidated the method's ability to adapt autonomously, without relying on reward\nengineering or prior setup information. Specifically, the algorithm learned the\ncorrect power profile for each unique surface characteristic, demonstrating\nsignificant improvements over hand-engineered optimal constant power strategies\n-- up to 23% better performance on rougher surfaces and 7% on mixed surfaces.\nThis approach represents a significant advancement in automating and optimizing\nlaser processes, with potential applications across multiple industries.\n","authors":["Giulio Masinelli","Chang Rajani","Patrik Hoffmann","Kilian Wasmer","David Atienza"],"pdf_url":"https://arxiv.org/pdf/2501.19102v2.pdf","comment":"Accepted for the 2025 IEEE International Conference on Robotics and\n Automation (ICRA), May 19-23, 2025, Atlanta, USA; Camera ready version --\n addressed reviewer comments in text, improved plot clarity"},{"id":"http://arxiv.org/abs/2501.13430v2","updated":"2025-03-06T09:22:38Z","published":"2025-01-23T07:29:44Z","title":"Wasserstein-regularized Conformal Prediction under General Distribution\n Shift","summary":" Conformal prediction yields a prediction set with guaranteed $1-\\alpha$\ncoverage of the true target under the i.i.d. assumption, which may not hold and\nlead to a gap between $1-\\alpha$ and the actual coverage. Prior studies bound\nthe gap using total variation distance, which cannot identify the gap changes\nunder distribution shift at a given $\\alpha$. 
Besides, existing methods are\nmostly limited to covariate shift,while general joint distribution shifts are\nmore common in practice but less researched.In response, we first propose a\nWasserstein distance-based upper bound of the coverage gap and analyze the\nbound using probability measure pushforwards between the shifted joint data and\nconformal score distributions, enabling a separation of the effect of covariate\nand concept shifts over the coverage gap. We exploit the separation to design\nan algorithm based on importance weighting and regularized representation\nlearning (WR-CP) to reduce the Wasserstein bound with a finite-sample error\nbound.WR-CP achieves a controllable balance between conformal prediction\naccuracy and efficiency. Experiments on six datasets prove that WR-CP can\nreduce coverage gaps to $3.2\\%$ across different confidence levels and outputs\nprediction sets 37$\\%$ smaller than the worst-case approach on average.\n","authors":["Rui Xu","Chao Chen","Yue Sun","Parvathinathan Venkitasubramaniam","Sihong Xie"],"pdf_url":"https://arxiv.org/pdf/2501.13430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04241v1","updated":"2025-03-06T09:22:23Z","published":"2025-03-06T09:22:23Z","title":"ThrowBench: Benchmarking LLMs by Predicting Runtime Exceptions","summary":" Modern Large Language Models (LLMs) have shown astounding capabilities of\ncode understanding and synthesis. In order to assess such capabilities, several\nbenchmarks have been devised (e.g., HumanEval). However, most benchmarks focus\non code synthesis from natural language instructions. Hence, such benchmarks do\nnot test for other forms of code understanding. Moreover, there have been\nconcerns about contamination and leakage. That is, benchmark problems (or\nclosely related problems) may appear in training set, strongly biasing\nbenchmark results. In this work we investigate whether large language models\ncan correctly predict runtime program behavior. 
To this end, we introduce\nThrowBench, a benchmark consisting of over 2,400 short user-written programs\nwritten in four different programming languages. The majority of these programs\nthrow an exception during runtime (due to a bug). LLMs are asked to predict\nwhether a presented program throws an exception and, if so, which one.\nEvaluating our benchmark on six state-of-the-art code LLMs we see modest\nperformance ranging from 19 to 38% (F1 score). Benchmarking a wider set of code\ncapabilities could improve the assessment of code LLMs and help identify weak\npoints in current models. Moreover, as ground-truth answers have been\ndetermined through program execution, leakage is not a concern. We release\nThrowBench as well as all of our results together with this work.\n","authors":["Julian Aron Prenner","Romain Robbes"],"pdf_url":"https://arxiv.org/pdf/2503.04241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04231v1","updated":"2025-03-06T09:12:43Z","published":"2025-03-06T09:12:43Z","title":"One-Shot Clustering for Federated Learning","summary":" Federated Learning (FL) is a widespread and well adopted paradigm of\ndecentralized learning that allows training one model from multiple sources\nwithout the need to directly transfer data between participating clients. Since\nits inception in 2015, it has been divided into numerous sub-fields that deal\nwith application-specific issues, be it data heterogeneity or resource\nallocation. One such sub-field, Clustered Federated Learning (CFL), is dealing\nwith the problem of clustering the population of clients into separate cohorts\nto deliver personalized models. Although few remarkable works have been\npublished in this domain, the problem is still largely unexplored, as its basic\nassumption and settings are slightly different from standard FL. 
In this work,\nwe present One-Shot Clustered Federated Learning (OCFL), a clustering-agnostic\nalgorithm that can automatically detect the earliest suitable moment for\nclustering. Our algorithm is based on the computation of cosine similarity\nbetween gradients of the clients and a temperature measure that detects when\nthe federated model starts to converge. We empirically evaluate our methodology\nby testing various one-shot clustering algorithms for over thirty different\ntasks on three benchmark datasets. Our experiments showcase the good\nperformance of our approach when used to perform CFL in an automated manner\nwithout the need to adjust hyperparameters.\n","authors":["Maciej Krzysztof Zuziak","Roberto Pellungrini","Salvatore Rinzivillo"],"pdf_url":"https://arxiv.org/pdf/2503.04231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.02796v3","updated":"2025-03-06T09:10:18Z","published":"2022-06-06T14:26:34Z","title":"Mixed Graph Contrastive Network for Semi-Supervised Node Classification","summary":" Graph Neural Networks (GNNs) have achieved promising performance in\nsemi-supervised node classification in recent years. However, the problem of\ninsufficient supervision, together with representation collapse, largely limits\nthe performance of the GNNs in this field. To alleviate the collapse of node\nrepresentations in semi-supervised scenario, we propose a novel graph\ncontrastive learning method, termed Mixed Graph Contrastive Network (MGCN). In\nour method, we improve the discriminative capability of the latent embeddings\nby an interpolation-based augmentation strategy and a correlation reduction\nmechanism. Specifically, we first conduct the interpolation-based augmentation\nin the latent space and then force the prediction model to change linearly\nbetween samples. 
Second, we enable the learned network to tell apart samples\nacross two interpolation-perturbed views through forcing the correlation matrix\nacross views to approximate an identity matrix. By combining the two settings,\nwe extract rich supervision information from both the abundant unlabeled nodes\nand the rare yet valuable labeled nodes for discriminative representation\nlearning. Extensive experimental results on six datasets demonstrate the\neffectiveness and the generality of MGCN compared to the existing\nstate-of-the-art methods. The code of MGCN is available at\nhttps://github.com/xihongyang1999/MGCN on Github.\n","authors":["Xihong Yang","Yiqi Wang","Yue Liu","Yi Wen","Lingyuan Meng","Sihang Zhou","Xinwang Liu","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2206.02796v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09263v3","updated":"2025-03-06T09:10:07Z","published":"2024-11-14T08:02:14Z","title":"Rethinking Weight-Averaged Model-merging","summary":" Model-merging has emerged as a powerful approach in deep learning, capable of\nenhancing model performance without any training. However, the underlying\nmechanisms that explain its effectiveness remain largely unexplored. 
In this\npaper, we investigate this technique from three novel perspectives to\nempirically provide deeper insights into why and how weight-averaged\nmodel-merging works: (1) we examine the intrinsic patterns captured by the\nlearning of the model weights, through the visualizations of their patterns on\nseveral datasets, showing that these weights often encode structured and\ninterpretable patterns and that is the essential why model-merging can work;\n(2) we mathematically and empirically investigate model ensemble merging\nstrategies based on averaging on weights versus averaging on features,\nproviding detailed analyses across diverse architectures and datasets; and (3)\nwe explore the impact on model-merging prediction stability in terms of\nchanging the parameter magnitude, revealing insights into the way of weight\naveraging works as regularization by showing the robustness across different\nparameter scales. Our findings shed light on the \"black box\" of weight-averaged\nmodel-merging, offering valuable insights and practical recommendations that\nadvance the model-merging process. The code is available at\nhttps://github.com/billhhh/Rethink-Merge.\n","authors":["Hu Wang","Congbo Ma","Ibrahim Almakky","Ian Reid","Gustavo Carneiro","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2411.09263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04229v1","updated":"2025-03-06T09:09:18Z","published":"2025-03-06T09:09:18Z","title":"Synthetic Data is an Elegant GIFT for Continual Vision-Language Models","summary":" Pre-trained Vision-Language Models (VLMs) require Continual Learning (CL) to\nefficiently update their knowledge and adapt to various downstream tasks\nwithout retraining from scratch. However, for VLMs, in addition to the loss of\nknowledge previously learned from downstream tasks, pre-training knowledge is\nalso corrupted during continual fine-tuning. 
This issue is exacerbated by the\nunavailability of original pre-training data, leaving VLM's generalization\nability degrading. In this paper, we propose GIFT, a novel continual\nfine-tuning approach that utilizes synthetic data to overcome catastrophic\nforgetting in VLMs. Taking advantage of recent advances in text-to-image\nsynthesis, we employ a pre-trained diffusion model to recreate both\npre-training and learned downstream task data. In this way, the VLM can revisit\nprevious knowledge through distillation on matching diffusion-generated images\nand corresponding text prompts. Leveraging the broad distribution and high\nalignment between synthetic image-text pairs in VLM's feature space, we propose\na contrastive distillation loss along with an image-text alignment constraint.\nTo further combat in-distribution overfitting and enhance distillation\nperformance with limited amount of generated data, we incorporate adaptive\nweight consolidation, utilizing Fisher information from these synthetic\nimage-text pairs and achieving a better stability-plasticity balance. Extensive\nexperiments demonstrate that our method consistently outperforms previous\nstate-of-the-art approaches across various settings.\n","authors":["Bin Wu","Wuxuan Shi","Jinqiao Wang","Mang Ye"],"pdf_url":"https://arxiv.org/pdf/2503.04229v1.pdf","comment":"This work is accepted by CVPR 2025. Modifications may be performed"},{"id":"http://arxiv.org/abs/2402.02998v2","updated":"2025-03-06T08:57:29Z","published":"2024-02-05T13:37:00Z","title":"Careful with that Scalpel: Improving Gradient Surgery with an EMA","summary":" Beyond minimizing a single training loss, many deep learning estimation\npipelines rely on an auxiliary objective to quantify and encourage desirable\nproperties of the model (e.g. performance on another dataset, robustness,\nagreement with a prior). 
Although the simplest approach to incorporating an\nauxiliary loss is to sum it with the training loss as a regularizer, recent\nworks have shown that one can improve performance by blending the gradients\nbeyond a simple sum; this is known as gradient surgery. We cast the problem as\na constrained minimization problem where the auxiliary objective is minimized\namong the set of minimizers of the training loss. To solve this bilevel\nproblem, we follow a parameter update direction that combines the training loss\ngradient and the orthogonal projection of the auxiliary gradient to the\ntraining gradient. In a setting where gradients come from mini-batches, we\nexplain how, using a moving average of the training loss gradients, we can\ncarefully maintain this critical orthogonality property. We demonstrate that\nour method, Bloop, can lead to much better performances on NLP and vision\nexperiments than other gradient surgery methods without EMA.\n","authors":["Yu-Guan Hsieh","James Thornton","Eugene Ndiaye","Michal Klein","Marco Cuturi","Pierre Ablin"],"pdf_url":"https://arxiv.org/pdf/2402.02998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04219v1","updated":"2025-03-06T08:54:31Z","published":"2025-03-06T08:54:31Z","title":"Quantum-Inspired Reinforcement Learning in the Presence of Epistemic\n Ambivalence","summary":" The complexity of online decision-making under uncertainty stems from the\nrequirement of finding a balance between exploiting known strategies and\nexploring new possibilities. Naturally, the uncertainty type plays a crucial\nrole in developing decision-making strategies that manage complexity\neffectively. In this paper, we focus on a specific form of uncertainty known as\nepistemic ambivalence (EA), which emerges from conflicting pieces of evidence\nor contradictory experiences. It creates a delicate interplay between\nuncertainty and confidence, distinguishing it from epistemic uncertainty that\ntypically diminishes with new information. 
Indeed, ambivalence can persist even\nafter additional knowledge is acquired. To address this phenomenon, we propose\na novel framework, called the epistemically ambivalent Markov decision process\n(EA-MDP), aiming to understand and control EA in decision-making processes.\nThis framework incorporates the concept of a quantum state from the quantum\nmechanics formalism, and its core is to assess the probability and reward of\nevery possible outcome. We calculate the reward function using quantum\nmeasurement techniques and prove the existence of an optimal policy and an\noptimal value function in the EA-MDP framework. We also propose the\nEA-epsilon-greedy Q-learning algorithm. To evaluate the impact of EA on\ndecision-making and the expedience of our framework, we study two distinct\nexperimental setups, namely the two-state problem and the lattice problem. Our\nresults show that using our methods, the agent converges to the optimal policy\nin the presence of EA.\n","authors":["Alireza Habibi","Saeed Ghoorchian","Setareh Maghsudi"],"pdf_url":"https://arxiv.org/pdf/2503.04219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.02495v2","updated":"2025-03-06T08:51:47Z","published":"2025-03-04T11:01:25Z","title":"Union of Experts: Adapting Hierarchical Routing to Equivalently\n Decomposed Transformer","summary":" We propose Union-of-Experts (UoE), which decomposes transformer into an\nequitant group of experts, and then implement selective routing on input data\nand experts. Our approach advances MoE design with four key innovations: (1) We\nconducted equitant expert decomposition on both MLP blocks and attention blocks\nbased on matrix partition in tensor parallelism. (2) We developed two routing\nparadigms: patch-wise data selection and expert selection, to apply routing\nacross different levels. (3) We design the architecture of UoE model, including\nSelective Multi-Head Attention (SMHA) and Union-of-MLP-Experts (UoME). 
(4) We\ndevelop parallel implementation of UoE's routing and computation operation, and\noptimize efficiency based on the hardware processing analysis. The experiments\ndemonstrate that the UoE model surpass Full Attention, state-of-art MoEs and\nefficient transformers (including the model architecture of recently proposed\nDeepSeek-V3) in several tasks across image and natural language domains. In\nlanguage modeling tasks, we achieve an average reduction of 2.38 in perplexity\ncompared to the best-performed MoE method with an average of 76% FLOPs. In Long\nRange Arena benchmark, we recorded an average score that is at least 0.68%\nhigher than all comparison models including Full Attention, MoEs, and\ntransformer variants, with only 50% FLOPs of the best MoE method. In image\nclassification, our model yielded an average accuracy improvement of 1.75% than\nthe best model while maintaining comparable FLOPs. The source codes are\navailable at https://github.com/YujiaoYang-work/UoE.\n","authors":["Yujiao Yang","Jing Lian","Linhui Li"],"pdf_url":"https://arxiv.org/pdf/2503.02495v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2503.01224v2","updated":"2025-03-06T08:51:38Z","published":"2025-03-03T06:43:45Z","title":"CE-U: Cross Entropy Unlearning","summary":" Large language models (LLMs) inadvertently memorize sensitive data from their\nmassive pretraining corpora \\cite{jang2022knowledge}. In this work, we propose\nCE-U (Cross Entropy Unlearning), a novel loss function designed specifically\nfor unlearning tasks. CE-U addresses fundamental limitations of gradient ascent\napproaches which suffer from instability due to vanishing gradients when model\nconfidence is high and gradient exploding when confidence is low. We also unify\nstandard cross entropy supervision and cross entropy unlearning into a single\nframework. 
Notably, on the TOFU benchmark for unlearning \\cite{maini2024tofu},\nCE-U achieves state-of-the-art results on LLaMA2-7B with 1\\% and 5\\%\nforgetting, even without the use of any extra reference model or additional\npositive samples. Our theoretical analysis further reveals that the gradient\ninstability issues also exist in popular reinforcement learning algorithms like\nDPO \\cite{rafailov2023direct} and GRPO\\cite{Shao2024DeepSeekMath}, as they\ninclude a gradient ascent component. This suggests that applying CE-U\nprinciples to reinforcement learning could be a promising direction for\nimproving stability and convergence.\n","authors":["Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2503.01224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2503.04204v1","updated":"2025-03-06T08:30:18Z","published":"2025-03-06T08:30:18Z","title":"FUSE: First-Order and Second-Order Unified SynthEsis in Stochastic\n Optimization","summary":" Stochastic optimization methods have actively been playing a critical role in\nmodern machine learning algorithms to deliver decent performance. While\nnumerous works have proposed and developed diverse approaches, first-order and\nsecond-order methods are in entirely different situations. The former is\nsignificantly pivotal and dominating in emerging deep learning but only leads\nconvergence to a stationary point. However, second-order methods are less\npopular due to their computational intensity in large-dimensional problems.\nThis paper presents a novel method that leverages both the first-order and\nsecond-order methods in a unified algorithmic framework, termed FUSE, from\nwhich a practical version (PV) is derived accordingly. FUSE-PV stands as a\nsimple yet efficient optimization method involving a switch-over between first\nand second orders. Additionally, we develop different criteria that determine\nwhen to switch. FUSE-PV has provably shown a smaller computational complexity\nthan SGD and Adam. 
To validate our proposed scheme, we present an ablation\nstudy on several simple test functions and show a comparison with baselines for\nbenchmark datasets.\n","authors":["Zhanhong Jiang","Md Zahid Hasan","Aditya Balu","Joshua R. Waite","Genyi Huang","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2503.04204v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2503.04203v1","updated":"2025-03-06T08:29:36Z","published":"2025-03-06T08:29:36Z","title":"Geometric Re-Analysis of Classical MDP Solving Algorithms","summary":" We build on a recently introduced geometric interpretation of Markov Decision\nProcesses (MDPs) to analyze classical MDP-solving algorithms: Value Iteration\n(VI) and Policy Iteration (PI). First, we develop a geometry-based analytical\napparatus, including a transformation that modifies the discount factor\n$\\gamma$, to improve convergence guarantees for these algorithms in several\nsettings. In particular, one of our results identifies a rotation component in\nthe VI method, and as a consequence shows that when a Markov Reward Process\n(MRP) induced by the optimal policy is irreducible and aperiodic, the\nasymptotic convergence rate of value iteration is strictly smaller than\n$\\gamma$.\n","authors":["Arsenii Mustafin","Aleksei Pakharev","Alex Olshevsky","Ioannis Ch. Paschalidis"],"pdf_url":"https://arxiv.org/pdf/2503.04203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.01130v3","updated":"2025-03-06T14:21:55Z","published":"2021-02-01T19:45:47Z","title":"Comparing hundreds of machine learning classifiers and discrete choice\n models in predicting travel behavior: an empirical benchmark","summary":" Numerous studies have compared machine learning (ML) and discrete choice\nmodels (DCMs) in predicting travel demand. However, these studies often lack\ngeneralizability as they compare models deterministically without considering\ncontextual variations. 
To address this limitation, our study develops an\nempirical benchmark by designing a tournament model, thus efficiently\nsummarizing a large number of experiments, quantifying the randomness in model\ncomparisons, and using formal statistical tests to differentiate between the\nmodel and contextual effects. This benchmark study compares two large-scale\ndata sources: a database compiled from literature review summarizing 136\nexperiments from 35 studies, and our own experiment data, encompassing a total\nof 6,970 experiments from 105 models and 12 model families. This benchmark\nstudy yields two key findings. Firstly, many ML models, particularly the\nensemble methods and deep learning, statistically outperform the DCM family\n(i.e., multinomial, nested, and mixed logit models). However, this study also\nhighlights the crucial role of the contextual factors (i.e., data sources,\ninputs and choice categories), which can explain models' predictive performance\nmore effectively than the differences in model types alone. Model performance\nvaries significantly with data sources, improving with larger sample sizes and\nlower dimensional alternative sets. After controlling all the model and\ncontextual factors, significant randomness still remains, implying inherent\nuncertainty in such model comparisons. 
Overall, we suggest that future\nresearchers shift more focus from context-specific model comparisons towards\nexamining model transferability across contexts and characterizing the inherent\nuncertainty in ML, thus creating more robust and generalizable next-generation\ntravel demand models.\n","authors":["Shenhao Wang","Baichuan Mo","Yunhan Zheng","Stephane Hess","Jinhua Zhao"],"pdf_url":"https://arxiv.org/pdf/2102.01130v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2503.04446v1","updated":"2025-03-06T14:02:01Z","published":"2025-03-06T14:02:01Z","title":"SMTPD: A New Benchmark for Temporal Prediction of Social Media\n Popularity","summary":" Social media popularity prediction task aims to predict the popularity of\nposts on social media platforms, which has a positive driving effect on\napplication scenarios such as content optimization, digital marketing and\nonline advertising. Though many studies have made significant progress, few of\nthem pay much attention to the integration between popularity prediction with\ntemporal alignment. In this paper, with exploring YouTube's multilingual and\nmulti-modal content, we construct a new social media temporal popularity\nprediction benchmark, namely SMTPD, and suggest a baseline framework for\ntemporal popularity prediction. Through data analysis and experiments, we\nverify that temporal alignment and early popularity play crucial roles in\nsocial media popularity prediction for not only deepening the understanding of\ntemporal dynamics of popularity in social media but also offering a suggestion\nabout developing more effective prediction models in this field. 
Code is\navailable at https://github.com/zhuwei321/SMTPD.\n","authors":["Yijie Xu","Bolun Zheng","Wei Zhu","Hangjia Pan","Yuchen Yao","Ning Xu","Anan Liu","Quan Zhang","Chenggang Yan"],"pdf_url":"https://arxiv.org/pdf/2503.04446v1.pdf","comment":"accept by CVPR 2025"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000..7f5166c Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 0000000..9ded9d9 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + 
--article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + 
--article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer 
{ + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file 
diff --git a/index.html b/index.html new file mode 100644 index 0000000..46a205a --- /dev/null +++ b/index.html @@ -0,0 +1,55625 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ FluidNexus: 3D Fluid Reconstruction and Prediction from a Single Video CVPR 2025 + + +
+ We study reconstructing and predicting 3D fluid appearance and velocity from +a single video. Current methods require multi-view videos for fluid +reconstruction. We present FluidNexus, a novel framework that bridges video +generation and physics simulation to tackle this task. Our key insight is to +synthesize multiple novel-view videos as references for reconstruction. +FluidNexus consists of two key components: (1) a novel-view video synthesizer +that combines frame-wise view synthesis with video diffusion refinement for +generating realistic videos, and (2) a physics-integrated particle +representation coupling differentiable simulation and rendering to +simultaneously facilitate 3D fluid reconstruction and prediction. To evaluate +our approach, we collect two new real-world fluid datasets featuring textured +backgrounds and object interactions. Our method enables dynamic novel view +synthesis, future prediction, and interaction simulation from a single fluid +video. Project website: https://yuegao.me/FluidNexus. + +
+
+ comment: CVPR 2025. Project website: https://yuegao.me/FluidNexus +
+
+
+
+
+ + ☆ Floxels: Fast Unsupervised Voxel Based Scene Flow Estimation CVPR 2025 + + +
+ Scene flow estimation is a foundational task for many robotic applications, +including robust dynamic object detection, automatic labeling, and sensor +synchronization. Two types of approaches to the problem have evolved: 1) +Supervised and 2) optimization-based methods. Supervised methods are fast +during inference and achieve high-quality results, however, they are limited by +the need for large amounts of labeled training data and are susceptible to +domain gaps. In contrast, unsupervised test-time optimization methods do not +face the problem of domain gaps but usually suffer from substantial runtime, +exhibit artifacts, or fail to converge to the right solution. In this work, we +mitigate several limitations of existing optimization-based methods. To this +end, we 1) introduce a simple voxel grid-based model that improves over the +standard MLP-based formulation in multiple dimensions and 2) introduce a new +multiframe loss formulation. 3) We combine both contributions in our new +method, termed Floxels. On the Argoverse 2 benchmark, Floxels is surpassed only +by EulerFlow among unsupervised methods while achieving comparable performance +at a fraction of the computational cost. Floxels achieves a massive speedup of +more than ~60 - 140x over EulerFlow, reducing the runtime from a day to 10 +minutes per sequence. Over the faster but low-quality baseline, NSFP, Floxels +achieves a speedup of ~14x. + +
+
+ comment: Accepted at CVPR 2025 +
+
+
+
+
+ + ☆ Iris Style Transfer: Enhancing Iris Recognition with Style Features and + Privacy Preservation through Neural Style Transfer + + +
+ Iris texture is widely regarded as a gold standard biometric modality for +authentication and identification. The demand for robust iris recognition +methods, coupled with growing security and privacy concerns regarding iris +attacks, has escalated recently. Inspired by neural style transfer, an advanced +technique that leverages neural networks to separate content and style +features, we hypothesize that iris texture's style features provide a reliable +foundation for recognition and are more resilient to variations like rotation +and perspective shifts than traditional approaches. Our experimental results +support this hypothesis, showing a significantly higher classification accuracy +compared to conventional features. Further, we propose using neural style +transfer to mask identifiable iris style features, ensuring the protection of +sensitive biometric information while maintaining the utility of eye images for +tasks like eye segmentation and gaze estimation. This work opens new avenues +for iris-oriented, secure, and privacy-aware biometric systems. + +
+
+ comment: 14 pages main paper, 4 pages appendix +
+
+
+
+
+ + ☆ DEAL-YOLO: Drone-based Efficient Animal Localization using YOLO ICLR 2025 + + +
+ Although advances in deep learning and aerial surveillance technology are
improving wildlife conservation efforts, complex and erratic environmental
conditions still pose a problem, requiring innovative solutions for
cost-effective small animal detection. This work introduces DEAL-YOLO, a novel
approach that improves small object detection in Unmanned Aerial Vehicle (UAV)
images by using multi-objective loss functions like Wise IoU (WIoU) and
Normalized Wasserstein Distance (NWD), which prioritize pixels near the centre
of the bounding box, ensuring smoother localization and reducing abrupt
deviations. Additionally, the model is optimized through efficient feature
extraction with Linear Deformable (LD) convolutions, enhancing accuracy while
maintaining computational efficiency. The Scaled Sequence Feature Fusion (SSFF)
module enhances object detection by effectively capturing inter-scale
relationships, improving feature representation, and boosting metrics through
optimized multiscale fusion. Comparison with baseline models reveals high
efficacy with up to 69.5% fewer parameters compared to vanilla Yolov8-N,
highlighting the robustness of the proposed modifications. Through this
approach, our paper aims to facilitate the detection of endangered species,
animal population analysis, habitat monitoring, biodiversity research, and
various other applications that enrich wildlife conservation efforts. DEAL-YOLO
employs a two-stage inference paradigm for object detection, refining selected
regions to improve localization and confidence. This approach enhances
performance, especially for small instances with low objectness scores.
+
+ comment: Accepted as a Poster at the ML4RS Workshop at ICLR 2025 +
+
+
+
+
+ + ☆ Teach YOLO to Remember: A Self-Distillation Approach for Continual + Object Detection + + +
+ Real-time object detectors like YOLO achieve exceptional performance when +trained on large datasets for multiple epochs. However, in real-world scenarios +where data arrives incrementally, neural networks suffer from catastrophic +forgetting, leading to a loss of previously learned knowledge. To address this, +prior research has explored strategies for Class Incremental Learning (CIL) in +Continual Learning for Object Detection (CLOD), with most approaches focusing +on two-stage object detectors. However, existing work suggests that Learning +without Forgetting (LwF) may be ineffective for one-stage anchor-free detectors +like YOLO due to noisy regression outputs, which risk transferring corrupted +knowledge. In this work, we introduce YOLO LwF, a self-distillation approach +tailored for YOLO-based continual object detection. We demonstrate that when +coupled with a replay memory, YOLO LwF significantly mitigates forgetting. +Compared to previous approaches, it achieves state-of-the-art performance, +improving mAP by +2.1% and +2.9% on the VOC and COCO benchmarks, respectively. + +
+
+
+
+
+ + ☆ What Are You Doing? A Closer Look at Controllable Human Video Generation + + +
+ High-quality benchmarks are crucial for driving progress in machine learning
research. However, despite the growing interest in video generation, there is
no comprehensive dataset to evaluate human generation. Humans can perform a
wide variety of actions and interactions, but existing datasets, like TikTok
and TED-Talks, lack the diversity and complexity to fully capture the
capabilities of video generation models. We close this gap by introducing 'What
Are You Doing?' (WYD): a new benchmark for fine-grained evaluation of
controllable image-to-video generation of humans. WYD consists of 1,544
captioned videos that have been meticulously collected and annotated with 56
fine-grained categories. These allow us to systematically measure performance
across 9 aspects of human generation, including actions, interactions and
motion. We also propose and validate automatic metrics that leverage our
annotations and better capture human evaluations. Equipped with our dataset and
metrics, we perform in-depth analyses of seven state-of-the-art models in
controllable image-to-video generation, showing how WYD provides novel insights
about the capabilities of these models. We release our data and code to drive
forward progress in human video generation modeling at
https://github.com/google-deepmind/wyd-benchmark.
+
+
+
+
+ + ☆ Implicit Neural Representation for Video and Image Super-Resolution + + +
+ We present a novel approach for super-resolution that utilizes implicit +neural representation (INR) to effectively reconstruct and enhance +low-resolution videos and images. By leveraging the capacity of neural networks +to implicitly encode spatial and temporal features, our method facilitates +high-resolution reconstruction using only low-resolution inputs and a 3D +high-resolution grid. This results in an efficient solution for both image and +video super-resolution. Our proposed method, SR-INR, maintains consistent +details across frames and images, achieving impressive temporal stability +without relying on the computationally intensive optical flow or motion +estimation typically used in other video super-resolution techniques. The +simplicity of our approach contrasts with the complexity of many existing +methods, making it both effective and efficient. Experimental evaluations show +that SR-INR delivers results on par with or superior to state-of-the-art +super-resolution methods, while maintaining a more straightforward structure +and reduced computational demands. These findings highlight the potential of +implicit neural representations as a powerful tool for reconstructing +high-quality, temporally consistent video and image signals from low-resolution +data. + +
+
+
+
+
+ + ☆ RadIR: A Scalable Framework for Multi-Grained Medical Image Retrieval + via Radiology Report Mining + + +
+ Developing advanced medical imaging retrieval systems is challenging due to
the varying definitions of 'similar images' across different medical contexts.
This challenge is compounded by the lack of large-scale, high-quality medical
imaging retrieval datasets and benchmarks. In this paper, we propose a novel
methodology that leverages dense radiology reports to define image-wise
similarity ordering at multiple granularities in a scalable and fully automatic
manner. Using this approach, we construct two comprehensive medical imaging
retrieval datasets: MIMIC-IR for Chest X-rays and CTRATE-IR for CT scans,
providing detailed image-image ranking annotations conditioned on diverse
anatomical structures. Furthermore, we develop two retrieval systems, RadIR-CXR
and model-ChestCT, which demonstrate superior performance in traditional
image-image and image-report retrieval tasks. These systems also enable
flexible, effective image retrieval conditioned on specific anatomical
structures described in text, achieving state-of-the-art results on 77 out of
78 metrics.
+
+
+
+
+ + ☆ Transferable Foundation Models for Geometric Tasks on Point Cloud + Representations: Geometric Neural Operators + + +
+ We introduce methods for obtaining pretrained Geometric Neural Operators +(GNPs) that can serve as basal foundation models for use in obtaining geometric +features. These can be used within data processing pipelines for machine +learning tasks and numerical methods. We show how our GNPs can be trained to +learn robust latent representations for the differential geometry of +point-clouds to provide estimates of metric, curvature, and other shape-related +features. We demonstrate how our pre-trained GNPs can be used (i) to estimate +the geometric properties of surfaces of arbitrary shape and topologies with +robustness in the presence of noise, (ii) to approximate solutions of geometric +partial differential equations (PDEs) on manifolds, and (iii) to solve +equations for shape deformations such as curvature driven flows. We also +release a package of the codes and weights for using our pre-trained GNPs for +processing point cloud representations. This allows for incorporating our +pre-trained GNPs as components for reuse within existing and new data +processing pipelines. The GNPs also can be used as part of numerical solvers +involving geometry or as part of methods for performing inference and other +geometric tasks. + +
+
+
+
+
+ + ☆ Adaptive Prototype Learning for Multimodal Cancer Survival Analysis + + +
+ Leveraging multimodal data, particularly the integration of whole-slide +histology images (WSIs) and transcriptomic profiles, holds great promise for +improving cancer survival prediction. However, excessive redundancy in +multimodal data can degrade model performance. In this paper, we propose +Adaptive Prototype Learning (APL), a novel and effective approach for +multimodal cancer survival analysis. APL adaptively learns representative +prototypes in a data-driven manner, reducing redundancy while preserving +critical information. Our method employs two sets of learnable query vectors +that serve as a bridge between high-dimensional representations and survival +prediction, capturing task-relevant features. Additionally, we introduce a +multimodal mixed self-attention mechanism to enable cross-modal interactions, +further enhancing information fusion. Extensive experiments on five benchmark +cancer datasets demonstrate the superiority of our approach over existing +methods. The code is available at https://github.com/HongLiuuuuu/APL. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + Simulating the Real World: A Unified Survey of Multimodal Generative + Models + + +
+ Understanding and replicating the real world is a critical challenge in
Artificial General Intelligence (AGI) research. To achieve this, many existing
approaches, such as world models, aim to capture the fundamental principles
governing the physical world, enabling more accurate simulations and meaningful
interactions. However, current methods often treat different modalities,
including 2D (images), videos, 3D, and 4D representations, as independent
domains, overlooking their interdependencies. Additionally, these methods
typically focus on isolated dimensions of reality without systematically
integrating their connections. In this survey, we present a unified survey for
multimodal generative models that investigates the progression of data
dimensionality in real-world simulation. Specifically, this survey starts from
2D generation (appearance), then moves to video (appearance+dynamics) and 3D
generation (appearance+geometry), and finally culminates in 4D generation that
integrates all dimensions. To the best of our knowledge, this is the first
attempt to systematically unify the study of 2D, video, 3D and 4D generation
within a single framework. To guide future research, we provide a comprehensive
review of datasets, evaluation metrics and future directions, fostering
insights for newcomers. This survey serves as a bridge to advance the study of
multimodal generative models and real-world simulation within a unified
framework.
+
+ comment: Repository for the related papers at + https://github.com/ALEEEHU/World-Simulator +
+
+
+
+
+ + ☆ Enhancing SAM with Efficient Prompting and Preference Optimization for + Semi-supervised Medical Image Segmentation CVPR 2025 + + +
+ Foundational models such as the Segment Anything Model (SAM) are gaining +traction in medical imaging segmentation, supporting multiple downstream tasks. +However, such models are supervised in nature, still relying on large annotated +datasets or prompts supplied by experts. Conventional techniques such as active +learning to alleviate such limitations are limited in scope and still +necessitate continuous human involvement and complex domain knowledge for label +refinement or establishing reward ground truth. To address these challenges, we +propose an enhanced Segment Anything Model (SAM) framework that utilizes +annotation-efficient prompts generated in a fully unsupervised fashion, while +still capturing essential semantic, location, and shape information through +contrastive language-image pretraining and visual question answering. We adopt +the direct preference optimization technique to design an optimal policy that +enables the model to generate high-fidelity segmentations with simple ratings +or rankings provided by a virtual annotator simulating the human annotation +process. State-of-the-art performance of our framework in tasks such as lung +segmentation, breast tumor segmentation, and organ segmentation across various +modalities, including X-ray, ultrasound, and abdominal CT, justifies its +effectiveness in low-annotation data scenarios. + +
+
+ comment: Accepted to CVPR 2025 +
+
+
+
+
+ + ☆ 3HANDS Dataset: Learning from Humans for Generating Naturalistic + Handovers with Supernumerary Robotic Limbs + + +
+ Supernumerary robotic limbs (SRLs) are robotic structures integrated closely
with the user's body, which augment human physical capabilities and necessitate
seamless, naturalistic human-machine interaction. For effective assistance in
physical tasks, enabling SRLs to hand over objects to humans is crucial. Yet,
designing heuristic-based policies for robots is time-consuming, difficult to
generalize across tasks, and results in less human-like motion. When trained
with proper datasets, generative models are powerful alternatives for creating
naturalistic handover motions. We introduce 3HANDS, a novel dataset of object
handover interactions between a participant performing a daily activity and
another participant enacting a hip-mounted SRL in a naturalistic manner. 3HANDS
captures the unique characteristics of SRL interactions: operating in intimate
personal space with asymmetric object origins, implicit motion synchronization,
and the user's engagement in a primary task during the handover. To demonstrate
the effectiveness of our dataset, we present three models: one that generates
naturalistic handover trajectories, another that determines the appropriate
handover endpoints, and a third that predicts the moment to initiate a
handover. In a user study (N=10), we compare the handover interaction performed
with our method to a baseline. The findings show that our method was
perceived as significantly more natural, less physically demanding, and more
comfortable.
+
+ comment: CHI '25 +
+
+
+
+
+ + ☆ PathoPainter: Augmenting Histopathology Segmentation via Tumor-aware + Inpainting + + +
+ Tumor segmentation plays a critical role in histopathology, but it requires +costly, fine-grained image-mask pairs annotated by pathologists. Thus, +synthesizing histopathology data to expand the dataset is highly desirable. +Previous works suffer from inaccuracies and limited diversity in image-mask +pairs, both of which affect training segmentation, particularly in small-scale +datasets and the inherently complex nature of histopathology images. To address +this challenge, we propose PathoPainter, which reformulates image-mask pair +generation as a tumor inpainting task. Specifically, our approach preserves the +background while inpainting the tumor region, ensuring precise alignment +between the generated image and its corresponding mask. To enhance dataset +diversity while maintaining biological plausibility, we incorporate a sampling +mechanism that conditions tumor inpainting on regional embeddings from a +different image. Additionally, we introduce a filtering strategy to exclude +uncertain synthetic regions, further improving the quality of the generated +data. Our comprehensive evaluation spans multiple datasets featuring diverse +tumor types and various training data scales. As a result, segmentation +improved significantly with our synthetic data, surpassing existing +segmentation data synthesis approaches, e.g., 75.69% -> 77.69% on CAMELYON16. +The code is available at https://github.com/HongLiuuuuu/PathoPainter. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ The Best of Both Worlds: Integrating Language Models and Diffusion + Models for Video Generation + + +
+ Recent advancements in text-to-video (T2V) generation have been driven by two
competing paradigms: autoregressive language models and diffusion models.
However, each paradigm has intrinsic limitations: language models struggle with
visual quality and error accumulation, while diffusion models lack semantic
understanding and causal modeling. In this work, we propose LanDiff, a hybrid
framework that synergizes the strengths of both paradigms through
coarse-to-fine generation. Our architecture introduces three key innovations:
(1) a semantic tokenizer that compresses 3D visual features into compact 1D
discrete representations through efficient semantic compression, achieving a
~14,000× compression ratio; (2) a language model that generates
semantic tokens with high-level semantic relationships; (3) a streaming
diffusion model that refines coarse semantics into high-fidelity videos.
Experiments show that LanDiff, a 5B model, achieves a score of 85.43 on the
VBench T2V benchmark, surpassing the state-of-the-art open-source models
Hunyuan Video (13B) and other commercial models such as Sora, Keling, and
Hailuo. Furthermore, our model also achieves state-of-the-art performance in
long video generation, surpassing other open-source models in this field. Our
demo can be viewed at https://landiff.github.io/.
+
+
+
+
+ + ☆ A Benchmark for Multi-Lingual Vision-Language Learning in Remote Sensing + Image Captioning + + +
+ Remote Sensing Image Captioning (RSIC) is a cross-modal field bridging vision +and language, aimed at automatically generating natural language descriptions +of features and scenes in remote sensing imagery. Despite significant advances +in developing sophisticated methods and large-scale datasets for training +vision-language models (VLMs), two critical challenges persist: the scarcity of +non-English descriptive datasets and the lack of multilingual capability +evaluation for models. These limitations fundamentally impede the progress and +practical deployment of RSIC, particularly in the era of large VLMs. To address +these challenges, this paper presents several significant contributions to the +field. First, we introduce and analyze BRSIC (Bilingual Remote Sensing Image +Captioning), a comprehensive bilingual dataset that enriches three established +English RSIC datasets with Chinese descriptions, encompassing 13,634 images +paired with 68,170 bilingual captions. Building upon this foundation, we +develop a systematic evaluation framework that addresses the prevalent +inconsistency in evaluation protocols, enabling rigorous assessment of model +performance through standardized retraining procedures on BRSIC. Furthermore, +we present an extensive empirical study of eight state-of-the-art large +vision-language models (LVLMs), examining their capabilities across multiple +paradigms including zero-shot inference, supervised fine-tuning, and +multi-lingual training. This comprehensive evaluation provides crucial insights +into the strengths and limitations of current LVLMs in handling multilingual +remote sensing tasks. Additionally, our cross-dataset transfer experiments +reveal interesting findings. The code and data will be available at +https://github.com/mrazhou/BRSIC. + +
+
+
+
+
+ + ☆ Omnidirectional Multi-Object Tracking CVPR 2025 + + +
+ Panoramic imagery, with its 360° field of view, offers comprehensive
information to support Multi-Object Tracking (MOT) in capturing spatial and
temporal relationships of surrounding objects. However, most MOT algorithms are
tailored for pinhole images with limited views, impairing their effectiveness
in panoramic settings. Additionally, panoramic image distortions, such as
resolution loss, geometric deformation, and uneven lighting, hinder direct
adaptation of existing MOT methods, leading to significant performance
degradation. To address these challenges, we propose OmniTrack, an
omnidirectional MOT framework that incorporates Tracklet Management to
introduce temporal cues, FlexiTrack Instances for object localization and
association, and the CircularStatE Module to alleviate image and geometric
distortions. This integration enables tracking in large field-of-view
scenarios, even under rapid sensor motion. To mitigate the lack of panoramic
MOT datasets, we introduce the QuadTrack dataset--a comprehensive panoramic
dataset collected by a quadruped robot, featuring diverse challenges such as
wide fields of view, intense motion, and complex environments. Extensive
experiments on the public JRDB dataset and the newly introduced QuadTrack
benchmark demonstrate the state-of-the-art performance of the proposed
framework. OmniTrack achieves a HOTA score of 26.92% on JRDB, representing an
improvement of 3.43%, and further achieves 23.45% on QuadTrack, surpassing the
baseline by 6.81%. The dataset and code will be made publicly available at
https://github.com/xifen523/OmniTrack.
+
+ comment: Accepted to CVPR 2025. The dataset and code will be made publicly + available at https://github.com/xifen523/OmniTrack +
+
+
+
+
+ + ☆ ViT-VS: On the Applicability of Pretrained Vision Transformer Features + for Generalizable Visual Servoing + + +
+ Visual servoing enables robots to precisely position their end-effector +relative to a target object. While classical methods rely on hand-crafted +features and thus are universally applicable without task-specific training, +they often struggle with occlusions and environmental variations, whereas +learning-based approaches improve robustness but typically require extensive +training. We present a visual servoing approach that leverages pretrained +vision transformers for semantic feature extraction, combining the advantages +of both paradigms while also being able to generalize beyond the provided +sample. Our approach achieves full convergence in unperturbed scenarios and +surpasses classical image-based visual servoing by up to 31.2% relative +improvement in perturbed scenarios. Even the convergence rates of +learning-based methods are matched despite requiring no task- or +object-specific training. Real-world evaluations confirm robust performance in +end-effector positioning, industrial box manipulation, and grasping of unseen +objects using only a reference from the same category. Our code and simulation +environment are available at: https://alessandroscherl.github.io/ViT-VS/ + +
+
+
+
+
+ + ☆ In-Context Reverse Classification Accuracy: Efficient Estimation of + Segmentation Quality without Ground-Truth + + +
+ Assessing the quality of automatic image segmentation is crucial in clinical +practice, but often very challenging due to the limited availability of ground +truth annotations. In this paper, we introduce In-Context Reverse +Classification Accuracy (In-Context RCA), a novel framework for automatically +estimating segmentation quality in the absence of ground-truth annotations. By +leveraging recent in-context learning segmentation models and incorporating +retrieval-augmentation techniques to select the most relevant reference images, +our approach enables efficient quality estimation with minimal reference data. +Validated across diverse medical imaging modalities, our method demonstrates +robust performance and computational efficiency, offering a promising solution +for automated quality control in clinical workflows, where fast and reliable +segmentation assessment is essential. The code is available at +https://github.com/mcosarinsky/In-Context-RCA. + +
+
+
+
+
+ + ☆ A Novel Solution for Drone Photogrammetry with Low-overlap Aerial Images + using Monocular Depth Estimation + + +
+ Low-overlap aerial imagery poses significant challenges to traditional +photogrammetric methods, which rely heavily on high image overlap to produce +accurate and complete mapping products. In this study, we propose a novel +workflow based on monocular depth estimation to address the limitations of +conventional techniques. Our method leverages tie points obtained from aerial +triangulation to establish a relationship between monocular depth and metric +depth, thus transforming the original depth map into a metric depth map, +enabling the generation of dense depth information and the comprehensive +reconstruction of the scene. For the experiments, a high-overlap drone dataset +containing 296 images is processed using Metashape to generate depth maps and +DSMs as ground truth. Subsequently, we create a low-overlap dataset by +selecting 20 images for experimental evaluation. Results demonstrate that while +the recovered depth maps and resulting DSMs achieve meter-level accuracy, they +provide significantly better completeness compared to traditional methods, +particularly in regions covered by single images. This study showcases the +potential of monocular depth estimation in low-overlap aerial photogrammetry. + +
+
+
+
+
+ + ☆ AnyAnomaly: Zero-Shot Customizable Video Anomaly Detection with LVLM + + +
+ Video anomaly detection (VAD) is crucial for video analysis and surveillance +in computer vision. However, existing VAD models rely on learned normal +patterns, which makes them difficult to apply to diverse environments. +Consequently, users should retrain models or develop separate AI models for new +environments, which requires expertise in machine learning, high-performance +hardware, and extensive data collection, limiting the practical usability of +VAD. To address these challenges, this study proposes customizable video +anomaly detection (C-VAD) technique and the AnyAnomaly model. C-VAD considers +user-defined text as an abnormal event and detects frames containing a +specified event in a video. We effectively implemented AnyAnomaly using a +context-aware visual question answering without fine-tuning the large vision +language model. To validate the effectiveness of the proposed model, we +constructed C-VAD datasets and demonstrated the superiority of AnyAnomaly. +Furthermore, our approach showed competitive performance on VAD benchmark +datasets, achieving state-of-the-art results on the UBnormal dataset and +outperforming other methods in generalization across all datasets. Our code is +available online at github.com/SkiddieAhn/Paper-AnyAnomaly. + +
+
+
+
+
+ + ☆ IMFine: 3D Inpainting via Geometry-guided Multi-view Refinement CVPR 2025 + + +
+ Current 3D inpainting and object removal methods are largely limited to +front-facing scenes, facing substantial challenges when applied to diverse, +"unconstrained" scenes where the camera orientation and trajectory are +unrestricted. To bridge this gap, we introduce a novel approach that produces +inpainted 3D scenes with consistent visual quality and coherent underlying +geometry across both front-facing and unconstrained scenes. Specifically, we +propose a robust 3D inpainting pipeline that incorporates geometric priors and +a multi-view refinement network trained via test-time adaptation, building on a +pre-trained image inpainting model. Additionally, we develop a novel inpainting +mask detection technique to derive targeted inpainting masks from object masks, +boosting the performance in handling unconstrained scenes. To validate the +efficacy of our approach, we create a challenging and diverse benchmark that +spans a wide range of scenes. Comprehensive experiments demonstrate that our +proposed method substantially outperforms existing state-of-the-art approaches. + +
+
+ comment: Accepted at CVPR 2025, + Project Page: https://xinxinzuo2353.github.io/imfine/ +
+
+
+
+
+ + ☆ ReynoldsFlow: Exquisite Flow Estimation via Reynolds Transport Theorem + + +
+ Optical flow is a fundamental technique for motion estimation, widely applied +in video stabilization, interpolation, and object tracking. Recent advancements +in artificial intelligence (AI) have enabled deep learning models to leverage +optical flow as an important feature for motion analysis. However, traditional +optical flow methods rely on restrictive assumptions, such as brightness +constancy and slow motion constraints, limiting their effectiveness in complex +scenes. Deep learning-based approaches require extensive training on large +domain-specific datasets, making them computationally demanding. Furthermore, +optical flow is typically visualized in the HSV color space, which introduces +nonlinear distortions when converted to RGB and is highly sensitive to noise, +degrading motion representation accuracy. These limitations inherently +constrain the performance of downstream models, potentially hindering object +tracking and motion analysis tasks. To address these challenges, we propose +Reynolds flow, a novel training-free flow estimation inspired by the Reynolds +transport theorem, offering a principled approach to modeling complex motion +dynamics. Beyond the conventional HSV-based visualization, denoted +ReynoldsFlow, we introduce an alternative representation, ReynoldsFlow+, +designed to improve flow visualization. We evaluate ReynoldsFlow and +ReynoldsFlow+ across three video-based benchmarks: tiny object detection on +UAVDB, infrared object detection on Anti-UAV, and pose estimation on GolfDB. +Experimental results demonstrate that networks trained with ReynoldsFlow+ +achieve state-of-the-art (SOTA) performance, exhibiting improved robustness and +efficiency across all tasks. + +
+
+ comment: 10 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Spatial regularisation for improved accuracy and interpretability in + keypoint-based registration + + +
+ Unsupervised registration strategies bypass requirements in ground truth +transforms or segmentations by optimising similarity metrics between fixed and +moved volumes. Among these methods, a recent subclass of approaches based on +unsupervised keypoint detection stand out as very promising for +interpretability. Specifically, these methods train a network to predict +feature maps for fixed and moving images, from which explainable centres of +mass are computed to obtain point clouds, that are then aligned in closed-form. +However, the features returned by the network often yield spatially diffuse +patterns that are hard to interpret, thus undermining the purpose of +keypoint-based registration. Here, we propose a three-fold loss to regularise +the spatial distribution of the features. First, we use the KL divergence to +model features as point spread functions that we interpret as probabilistic +keypoints. Then, we sharpen the spatial distributions of these features to +increase the precision of the detected landmarks. Finally, we introduce a new +repulsive loss across keypoints to encourage spatial diversity. Overall, our +loss considerably improves the interpretability of the features, which now +correspond to precise and anatomically meaningful landmarks. We demonstrate our +three-fold loss in foetal rigid motion tracking and brain MRI affine +registration tasks, where it not only outperforms state-of-the-art unsupervised +strategies, but also bridges the gap with state-of-the-art supervised methods. +Our code is available at https://github.com/BenBillot/spatial_regularisation. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Learning Object Placement Programs for Indoor Scene Synthesis with + Iterative Self Training + + +
+ Data driven and autoregressive indoor scene synthesis systems generate indoor +scenes automatically by suggesting and then placing objects one at a time. +Empirical observations show that current systems tend to produce incomplete +next object location distributions. We introduce a system which addresses this +problem. We design a Domain Specific Language (DSL) that specifies functional +constraints. Programs from our language take as input a partial scene and +object to place. Upon execution they predict possible object placements. We +design a generative model which writes these programs automatically. Available +3D scene datasets do not contain programs to train on, so we build upon +previous work in unsupervised program induction to introduce a new program +bootstrapping algorithm. In order to quantify our empirical observations we +introduce a new evaluation procedure which captures how well a system models +per-object location distributions. We ask human annotators to label all the +possible places an object can go in a scene and show that our system produces +per-object location distributions more consistent with human annotators. Our +system also generates indoor scenes of comparable quality to previous systems +and while previous systems degrade in performance when training data is sparse, +our system does not degrade to the same degree. + +
+
+ comment: 21 pages, 20 figures Subjects: Graphics (cs.GR), Computer Vision and + Pattern Recognition (cs.CV), Machine Learning (cs.LG) +
+
+
+
+
+ + ☆ Semantic Alignment of Unimodal Medical Text and Vision Representations + + +
+ General-purpose AI models, particularly those designed for text and vision, +demonstrate impressive versatility across a wide range of deep-learning tasks. +However, they often underperform in specialised domains like medical imaging, +where domain-specific solutions or alternative knowledge transfer approaches +are typically required. Recent studies have noted that general-purpose models +can exhibit similar latent spaces when processing semantically related data, +although this alignment does not occur naturally. Building on this insight, it +has been shown that applying a simple transformation - at most affine - +estimated from a subset of semantically corresponding samples, known as +anchors, enables model stitching across diverse training paradigms, +architectures, and modalities. In this paper, we explore how semantic alignment +- estimating transformations between anchors - can bridge general-purpose AI +with specialised medical knowledge. Using multiple public chest X-ray datasets, +we demonstrate that model stitching across model architectures allows general +models to integrate domain-specific knowledge without additional training, +leading to improved performance on medical tasks. Furthermore, we introduce a +novel zero-shot classification approach for unimodal vision encoders that +leverages semantic alignment across modalities. Our results show that our +method not only outperforms general multimodal models but also approaches the +performance levels of fully trained, medical-specific multimodal solutions. + +
+
+
+
+
+ + ☆ ForestLPR: LiDAR Place Recognition in Forests Attentioning Multiple BEV + Density Images CVPR2025 + + +
+ Place recognition is essential to maintain global consistency in large-scale +localization systems. While research in urban environments has progressed +significantly using LiDARs or cameras, applications in natural forest-like +environments remain largely under-explored. Furthermore, forests present +particular challenges due to high self-similarity and substantial variations in +vegetation growth over time. In this work, we propose a robust LiDAR-based +place recognition method for natural forests, ForestLPR. We hypothesize that a +set of cross-sectional images of the forest's geometry at different heights +contains the information needed to recognize revisiting a place. The +cross-sectional images are represented by bird's-eye view (BEV) density images of horizontal +slices of the point cloud at different heights. Our approach utilizes a visual +transformer as the shared backbone to produce sets of local descriptors and +introduces a multi-BEV interaction module to attend to information at different +heights adaptively. It is followed by an aggregation layer that produces a +rotation-invariant place descriptor. We evaluated the efficacy of our method +extensively on real-world data from public benchmarks as well as robotic +datasets and compared it against the state-of-the-art (SOTA) methods. The +results indicate that ForestLPR has consistently good performance on all +evaluations and achieves an average increase of 7.38% and 9.11% on Recall@1 +over the closest competitor on intra-sequence loop closure detection and +inter-sequence re-localization, respectively, validating our hypothesis. + +
+
+ comment: accepted by CVPR2025 +
+
+
+
+
+ + ☆ Gate-Shift-Pose: Enhancing Action Recognition in Sports with Skeleton + Information + + +
+ This paper introduces Gate-Shift-Pose, an enhanced version of Gate-Shift-Fuse +networks, designed for athlete fall classification in figure skating by +integrating skeleton pose data alongside RGB frames. We evaluate two fusion +strategies: early-fusion, which combines RGB frames with Gaussian heatmaps of +pose keypoints at the input stage, and late-fusion, which employs a +multi-stream architecture with attention mechanisms to combine RGB and pose +features. Experiments on the FR-FS dataset demonstrate that Gate-Shift-Pose +significantly outperforms the RGB-only baseline, improving accuracy by up to +40% with ResNet18 and 20% with ResNet50. Early-fusion achieves the highest +accuracy (98.08%) with ResNet50, leveraging the model's capacity for effective +multimodal integration, while late-fusion is better suited for lighter +backbones like ResNet18. These results highlight the potential of multimodal +architectures for sports action recognition and the critical role of skeleton +pose information in capturing complex motion patterns. + +
+
+
+
+
+ + ☆ Question-Aware Gaussian Experts for Audio-Visual Question Answering CVPR 2025 + + +
+ Audio-Visual Question Answering (AVQA) requires not only question-based +multimodal reasoning but also precise temporal grounding to capture subtle +dynamics for accurate prediction. However, existing methods mainly use question +information implicitly, limiting focus on question-specific details. +Furthermore, most studies rely on uniform frame sampling, which can miss key +question-relevant frames. Although recent Top-K frame selection methods aim to +address this, their discrete nature still overlooks fine-grained temporal +details. This paper proposes QA-TIGER, a novel framework that +explicitly incorporates question information and models continuous temporal +dynamics. Our key idea is to use Gaussian-based modeling to adaptively focus on +both consecutive and non-consecutive frames based on the question, while +explicitly injecting question information and applying progressive refinement. +We leverage a Mixture of Experts (MoE) to flexibly implement multiple Gaussian +models, activating temporal experts specifically tailored to the question. +Extensive experiments on multiple AVQA benchmarks show that QA-TIGER +consistently achieves state-of-the-art performance. Code is available at +https://github.com/AIM-SKKU/QA-TIGER + +
+
+ comment: CVPR 2025. Project page at https://aim-skku.github.io/QA-TIGER/ +
+
+
+
+
+ + ☆ TPC: Cross-Temporal Prediction Connection for Vision-Language Model + Hallucination Reduction + + +
+ Vision-language models (VLMs) have achieved remarkable advancements, +capitalizing on the impressive capabilities of large language models (LLMs) +across diverse tasks. Despite this, a critical challenge known as hallucination +occurs when models overconfidently describe objects or attributes absent from +the image, a problem exacerbated by the tendency of VLMs to rely on linguistic +priors. This limitation reduces model reliability in high-stakes applications. +In this work, we have observed the characteristic of logits' continuity +consistency enhancement and introduced a straightforward and efficient method, +Cross-Temporal Prediction Connection (TPC), designed to enhance the semantic +consistency of logits by connecting them temporally across timesteps. TPC +amplifies information flow and improves coherence, effectively reducing +hallucination. Extensive experiments show that TPC surpasses existing +representatives, delivering superior performance in both accuracy and +efficiency while maintaining robustness in open-ended text generation tasks. + +
+
+
+
+
+ + ☆ A lightweight model FDM-YOLO for small target improvement based on + YOLOv8 + + +
+ Small targets are particularly difficult to detect due to their low pixel +count, complex backgrounds, and varying shooting angles, which make it hard for +models to extract effective features. While some large-scale models offer high +accuracy, their long inference times make them unsuitable for real-time +deployment on edge devices. On the other hand, models designed for low +computational power often suffer from poor detection accuracy. This paper +focuses on small target detection and explores methods for object detection +under low computational constraints. Building on the YOLOv8 model, we propose a +new network architecture called FDM-YOLO. Our research includes the following +key contributions: We introduce FDM-YOLO by analyzing the output of the YOLOv8 +detection head. We add a high-resolution layer and remove the large target +detection layer to better handle small targets. Based on PConv, we propose a +lightweight network structure called Fast-C2f, which is integrated into the PAN +module of the model. To mitigate the accuracy loss caused by model +lightweighting, we employ dynamic upsampling (Dysample) and a lightweight EMA +attention mechanism. The FDM-YOLO model was validated on the Visdrone dataset, +achieving a 38% reduction in parameter count and improving the mAP@0.5 score +from 38.4% to 42.5%, all while maintaining nearly the same inference speed. +This demonstrates the effectiveness of our approach in balancing accuracy and +efficiency for edge device deployment. + +
+
+
+
+
+ + ☆ ToFu: Visual Tokens Reduction via Fusion for Multi-modal, Multi-patch, + Multi-image Task + + +
+ Large Multimodal Models (LMMs) are powerful tools that are capable of +reasoning and understanding multimodal information beyond text and language. +Despite their entrenched impact, the development of LMMs is hindered by the +higher computational requirements compared to their unimodal counterparts. One +of the main causes of this is the large amount of tokens needed to encode the +visual input, which is especially evident for multi-image multimodal tasks. +Recent approaches to reduce visual tokens depend on the visual encoder +architecture, require fine-tuning the LLM to maintain the performance, and only +consider single-image scenarios. To address these limitations, we propose ToFu, +a visual encoder-agnostic, training-free Token Fusion strategy that combines +redundant visual tokens of LMMs for high-resolution, multi-image, tasks. The +core intuition behind our method is straightforward yet effective: preserve +distinctive tokens while combining similar ones. We achieve this by +sequentially examining visual tokens and deciding whether to merge them with +others or keep them as separate entities. We validate our approach on the +well-established LLaVA-Interleave Bench, which covers challenging multi-image +tasks. In addition, we push to the extreme our method by testing it on a +newly-created benchmark, ComPairs, focused on multi-image comparisons where a +larger amount of images and visual tokens are inputted to the LMMs. Our +extensive analysis, considering several LMM architectures, demonstrates the +benefits of our approach both in terms of efficiency and performance gain. + +
+
+
+
+
+ + ☆ EvidMTL: Evidential Multi-Task Learning for Uncertainty-Aware Semantic + Surface Mapping from Monocular RGB Images IROS 2025 + + +
+ For scene understanding in unstructured environments, an accurate and +uncertainty-aware metric-semantic mapping is required to enable informed action +selection by autonomous systems. Existing mapping methods often suffer from +overconfident semantic predictions, and sparse and noisy depth sensing, leading +to inconsistent map representations. In this paper, we therefore introduce +EvidMTL, a multi-task learning framework that uses evidential heads for depth +estimation and semantic segmentation, enabling uncertainty-aware inference from +monocular RGB images. To enable uncertainty-calibrated evidential multi-task +learning, we propose a novel evidential depth loss function that jointly +optimizes the belief strength of the depth prediction in conjunction with +evidential segmentation loss. Building on this, we present EvidKimera, an +uncertainty-aware semantic surface mapping framework, which uses evidential +depth and semantics prediction for improved 3D metric-semantic consistency. We +train and evaluate EvidMTL on the NYUDepthV2 and assess its zero-shot +performance on ScanNetV2, demonstrating superior uncertainty estimation +compared to conventional approaches while maintaining comparable depth +estimation and semantic segmentation. In zero-shot mapping tests on ScanNetV2, +EvidKimera outperforms Kimera in semantic surface mapping accuracy and +consistency, highlighting the benefits of uncertainty-aware mapping and +underscoring its potential for real-world robotic applications. + +
+
+ comment: Submitted to IROS 2025 Conference +
+
+
+
+
+ + ☆ PointsToWood: A deep learning framework for complete canopy leaf-wood + segmentation of TLS data across diverse European forests + + +
+ Point clouds from Terrestrial Laser Scanning (TLS) are an increasingly +popular source of data for studying plant structure and function but typically +require extensive manual processing to extract ecologically important +information. One key task is the accurate semantic segmentation of different +plant material within point clouds, particularly wood and leaves, which is +required to understand plant productivity, architecture and physiology. +Existing automated semantic segmentation methods are primarily developed for +single ecosystem types, and whilst they show good accuracy for biomass +assessment from the trunk and large branches, often perform less well within +the crown. In this study, we demonstrate a new framework that uses a deep +learning architecture newly developed from PointNet and pointNEXT for +processing 3D point clouds to provide a reliable semantic segmentation of wood +and leaf in TLS point clouds from the tree base to branch tips, trained on data +from diverse mature European forests. Our model uses meticulously labelled data +combined with voxel-based sampling, neighbourhood rescaling, and a novel gated +reflectance integration module embedded throughout the feature extraction +layers. We evaluate its performance across open datasets from boreal, +temperate, Mediterranean and tropical regions, encompassing diverse ecosystem +types and sensor characteristics. Our results show consistent outperformance +against the most widely used PointNet based approach for leaf/wood segmentation +on our high-density TLS dataset collected across diverse mixed forest plots +across all major biomes in Europe. We also find consistently strong performance +when tested on other open data from China, Eastern Cameroon, Germany and Finland, +collected using both time-of-flight and phase-shift sensors, showcasing the +transferability of our model to a wide range of ecosystems and sensors. + +
+
+
+
+
+ + ☆ Learning Transformer-based World Models with Contrastive Predictive + Coding + + +
+ The DreamerV3 algorithm recently obtained remarkable performance across +diverse environment domains by learning an accurate world model based on +Recurrent Neural Networks (RNNs). Following the success of model-based +reinforcement learning algorithms and the rapid adoption of the Transformer +architecture for its superior training efficiency and favorable scaling +properties, recent works such as STORM have proposed replacing RNN-based world +models with Transformer-based world models using masked self-attention. +However, despite the improved training efficiency of these methods, their +impact on performance remains limited compared to the Dreamer algorithm, +struggling to learn competitive Transformer-based world models. In this work, +we show that the next state prediction objective adopted in previous approaches +is insufficient to fully exploit the representation capabilities of +Transformers. We propose to extend world model predictions to longer time +horizons by introducing TWISTER (Transformer-based World model wIth contraSTivE +Representations), a world model using action-conditioned Contrastive Predictive +Coding to learn high-level temporal feature representations and improve the +agent performance. TWISTER achieves a human-normalized mean score of 162% on +the Atari 100k benchmark, setting a new record among state-of-the-art methods +that do not employ look-ahead search. + +
+
+
+
+
+ + ☆ Scale-Invariant Adversarial Attack against Arbitrary-scale + Super-resolution + + +
+ The advent of local continuous image function (LIIF) has garnered significant +attention for arbitrary-scale super-resolution (SR) techniques. However, while +the vulnerabilities of fixed-scale SR have been assessed, the robustness of +continuous representation-based arbitrary-scale SR against adversarial attacks +remains an area warranting further exploration. The elaborately designed +adversarial attacks for fixed-scale SR are scale-dependent, which will cause +time-consuming and memory-consuming problems when applied to arbitrary-scale +SR. To address this concern, we propose a simple yet effective +"scale-invariant" SR adversarial attack method with good transferability, +termed SIAGT. Specifically, we propose to construct resource-saving attacks by +exploiting finite discrete points of continuous representation. In addition, we +formulate a coordinate-dependent loss to enhance the cross-model +transferability of the attack. The attack can significantly deteriorate the SR +images while introducing imperceptible distortion to the targeted +low-resolution (LR) images. Experiments carried out on three popular LIIF-based +SR approaches and four classical SR datasets show remarkable attack performance +and transferability of SIAGT. + +
+
+ comment: 15 pages, accepted by TIFS 2025 +
+
+
+
+
+ + ☆ MIDAS: Modeling Ground-Truth Distributions with Dark Knowledge for + Domain Generalized Stereo Matching + + +
+ Despite the significant advances in domain generalized stereo matching, +existing methods still exhibit domain-specific preferences when transferring +from synthetic to real domains, hindering their practical applications in +complex and diverse scenarios. The probability distributions predicted by the +stereo network naturally encode rich similarity and uncertainty information. +Inspired by this observation, we propose to extract these two types of dark +knowledge from the pre-trained network to model intuitive multi-modal +ground-truth distributions for both edge and non-edge regions. To mitigate the +inherent domain preferences of a single network, we adopt network ensemble and +further distinguish between objective and biased knowledge in the Laplace +parameter space. Finally, the objective knowledge and the original disparity +labels are jointly modeled as a mixture of Laplacians to provide fine-grained +supervision for the stereo network training. Extensive experiments demonstrate +that: 1) Our method is generic and effectively improves the generalization of +existing networks. 2) PCWNet with our method achieves the state-of-the-art +generalization performance on both KITTI 2015 and 2012 datasets. 3) Our method +outperforms existing methods in comprehensive ranking across four popular +real-world datasets. + +
+
+
+
+
+ + ☆ ObjMST: An Object-Focused Multimodal Style Transfer Framework + + +
+ We propose ObjMST, an object-focused multimodal style transfer framework that +provides separate style supervision for salient objects and surrounding +elements while addressing alignment issues in multimodal representation +learning. Existing image-text multimodal style transfer methods face the +following challenges: (1) generating non-aligned and inconsistent multimodal +style representations; and (2) content mismatch, where identical style patterns +are applied to both salient objects and their surrounding elements. Our +approach mitigates these issues by: (1) introducing a Style-Specific Masked +Directional CLIP Loss, which ensures consistent and aligned style +representations for both salient objects and their surroundings; and (2) +incorporating a salient-to-key mapping mechanism for stylizing salient objects, +followed by image harmonization to seamlessly blend the stylized objects with +their environment. We validate the effectiveness of ObjMST through experiments, +using both quantitative metrics and qualitative visual evaluations of the +stylized outputs. Our code is available at: +https://github.com/chandagrover/ObjMST. + +
+
+ comment: 8 pages, 8 Figures, 3 Tables +
+
+
+
+
+ + ☆ PLMP -- Point-Line Minimal Problems for Projective SfM + + +
+ We completely classify all minimal problems for Structure-from-Motion (SfM) +where arrangements of points and lines are fully observed by multiple +uncalibrated pinhole cameras. We find 291 minimal problems, 73 of which have +unique solutions and can thus be solved linearly. Two of the linear problems +allow an arbitrary number of views, while all other minimal problems have at +most 9 cameras. All minimal problems have at most 7 points and at most 12 +lines. We compute the number of solutions of each minimal problem, as this +gives a measurement of the problem's intrinsic difficulty, and find that these +numbers are relatively low (e.g., when comparing with minimal problems for +calibrated cameras). Finally, by exploring stabilizer subgroups of +subarrangements, we develop a geometric and systematic way to 1) factorize +minimal problems into smaller problems, 2) identify minimal problems in +underconstrained problems, and 3) formally prove non-minimality. + +
+
+
+
+
+ + LEDiT: Your Length-Extrapolatable Diffusion Transformer without + Positional Encoding + + +
+ Diffusion transformers(DiTs) struggle to generate images at resolutions +higher than their training resolutions. The primary obstacle is that the +explicit positional encodings(PE), such as RoPE, need extrapolation which +degrades performance when the inference resolution differs from training. In +this paper, we propose a Length-Extrapolatable Diffusion Transformer(LEDiT), a +simple yet powerful architecture to overcome this limitation. LEDiT needs no +explicit PEs, thereby avoiding extrapolation. The key innovations of LEDiT are +introducing causal attention to implicitly impart global positional information +to tokens, while enhancing locality to precisely distinguish adjacent tokens. +Experiments on 256x256 and 512x512 ImageNet show that LEDiT can scale the +inference resolution to 512x512 and 1024x1024, respectively, while achieving +better image quality compared to current state-of-the-art length extrapolation +methods(NTK-aware, YaRN). Moreover, LEDiT achieves strong extrapolation +performance with just 100K steps of fine-tuning on a pretrained DiT, +demonstrating its potential for integration into existing text-to-image DiTs. + +
+
+
+
+
+ + ☆ GaussianVideo: Efficient Video Representation and Compression by + Gaussian Splatting + + +
+ Implicit Neural Representation for Videos (NeRV) has introduced a novel +paradigm for video representation and compression, outperforming traditional +codecs. As model size grows, however, slow encoding and decoding speed and high +memory consumption hinder its application in practice. To address these +limitations, we propose a new video representation and compression method based +on 2D Gaussian Splatting to efficiently handle video data. Our proposed +deformable 2D Gaussian Splatting dynamically adapts the transformation of 2D +Gaussians at each frame, significantly reducing memory cost. Equipped with a +multi-plane-based spatiotemporal encoder and a lightweight decoder, it predicts +changes in color, coordinates, and shape of initialized Gaussians, given the +time step. By leveraging temporal gradients, our model effectively captures +temporal redundancy at negligible cost, significantly enhancing video +representation efficiency. Our method reduces GPU memory usage by up to 78.4%, +and significantly expedites video processing, achieving 5.5x faster training +and 12.5x faster decoding compared to the state-of-the-art NeRV methods. + +
+
+
+
+
+ + ☆ GBT-SAM: A Parameter-Efficient Depth-Aware Model for Generalizable Brain + tumour Segmentation on mp-MRI + + +
+ Gliomas are brain tumours that stand out for their highly lethal and +aggressive nature, which demands a precise approach in their diagnosis. Medical +image segmentation plays a crucial role in the evaluation and follow-up of +these tumours, allowing specialists to analyse their morphology. However, +existing methods for automatic glioma segmentation often lack generalization +capability across other brain tumour domains, require extensive computational +resources, or fail to fully utilize the multi-parametric MRI (mp-MRI) data used +to delineate them. In this work, we introduce GBT-SAM, a novel Generalizable +Brain Tumour (GBT) framework that extends the Segment Anything Model (SAM) to +brain tumour segmentation tasks. Our method employs a two-step training +protocol: first, fine-tuning the patch embedding layer to process the entire +mp-MRI modalities, and second, incorporating parameter-efficient LoRA blocks +and a Depth-Condition block into the Vision Transformer (ViT) to capture +inter-slice correlations. GBT-SAM achieves state-of-the-art performance on the +Adult Glioma dataset (Dice Score of $93.54$) while demonstrating robust +generalization across Meningioma, Pediatric Glioma, and Sub-Saharan Glioma +datasets. Furthermore, GBT-SAM uses less than 6.5M trainable parameters, thus +offering an efficient solution for brain tumour segmentation. Our code and +models are available at https://github.com/vpulab/med-sam-brain. + +
+
+
+
+
+ + ☆ A Modular Pipeline for 3D Object Tracking Using RGB Cameras + + +
+ Object tracking is a key challenge of computer vision with various +applications that all require different architectures. Most tracking systems +have limitations such as constraining all movement to a 2D plane and they often +track only one object. In this paper, we present a new modular pipeline that +calculates 3D trajectories of multiple objects. It is adaptable to various +settings where multiple time-synced and stationary cameras record moving +objects, using off the shelf webcams. Our pipeline was tested on the Table +Setting Dataset, where participants are recorded with various sensors as they +set a table with tableware objects. We need to track these manipulated objects, +using 6 rgb webcams. Challenges include: Detecting small objects in 9.874.699 +camera frames, determining camera poses, discriminating between nearby and +overlapping objects, temporary occlusions, and finally calculating a 3D +trajectory using the right subset of an average of 11.12.456 pixel coordinates +per 3-minute trial. We implement a robust pipeline that results in accurate +trajectories with covariance of x,y,z-position as a confidence metric. It deals +dynamically with appearing and disappearing objects, instantiating new Extended +Kalman Filters. It scales to hundreds of table-setting trials with very little +human annotation input, even with the camera poses of each trial unknown. The +code is available at https://github.com/LarsBredereke/object_tracking + +
+
+ comment: 9 pages, 11 figures, original paper not to be published anywhere else +
+
+
+
+
+ + ☆ S2Gaussian: Sparse-View Super-Resolution 3D Gaussian Splatting CVPR 2025 + + +
+ In this paper, we aim ambitiously for a realistic yet challenging problem, +namely, how to reconstruct high-quality 3D scenes from sparse low-resolution +views that simultaneously suffer from deficient perspectives and clarity. +Because existing methods deal with only sparse views or only low-resolution +observations, they fail to handle such hybrid and complicated scenarios. To +this end, we propose a novel Sparse-view Super-resolution 3D Gaussian Splatting +framework, dubbed S2Gaussian, that can reconstruct structure-accurate and +detail-faithful 3D scenes with only sparse and low-resolution views. The +S2Gaussian operates in a two-stage fashion. In the first stage, we initially +optimize a low-resolution Gaussian representation with depth regularization and +densify it to initialize the high-resolution Gaussians through a tailored +Gaussian Shuffle Split operation. In the second stage, we refine the +high-resolution Gaussians with the super-resolved images generated from both +original sparse views and pseudo-views rendered by the low-resolution +Gaussians. In this stage, a customized blur-free inconsistency modeling scheme and a +3D robust optimization strategy are elaborately designed to mitigate multi-view +inconsistency and eliminate erroneous updates caused by imperfect supervision. +Extensive experiments demonstrate superior results, in particular +establishing new state-of-the-art performance with more consistent geometry +and finer details. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ☆ Shaken, Not Stirred: A Novel Dataset for Visual Understanding of Glasses + in Human-Robot Bartending Tasks IROS + + +
+ Datasets for object detection often do not account for enough variety of +glasses, due to their transparent and reflective properties. Specifically, +open-vocabulary object detectors, widely used in embodied robotic agents, fail +to distinguish subclasses of glasses. This scientific gap poses an issue to +robotic applications that suffer from accumulating errors between detection, +planning, and action execution. The paper introduces a novel method for the +acquisition of real-world data from RGB-D sensors that minimizes human effort. +We propose an auto-labeling pipeline that generates labels for all the acquired +frames based on the depth measurements. We provide a novel real-world glass +object dataset that was collected on the Neuro-Inspired COLlaborator (NICOL), a +humanoid robot platform. The data set consists of 7850 images recorded from +five different cameras. We show that our trained baseline model outperforms +state-of-the-art open-vocabulary approaches. In addition, we deploy our +baseline model in an embodied agent approach to the NICOL platform, on which it +achieves a success rate of 81% in a human-robot bartending scenario. + +
+
+ comment: Submitted to IEEE/RSJ International Conference on Intelligent Robots + and Systems (IROS) 2025 +
+
+
+
+
+ + ☆ ControlFill: Spatially Adjustable Image Inpainting from Prompt Learning + + +
+ In this report, I present an inpainting framework named \textit{ControlFill}, +which involves training two distinct prompts: one for generating plausible +objects within a designated mask (\textit{creation}) and another for filling +the region by extending the background (\textit{removal}). During the inference +stage, these learned embeddings guide a diffusion network that operates without +requiring heavy text encoders. By adjusting the relative significance of the +two prompts and employing classifier-free guidance, users can control the +intensity of removal or creation. Furthermore, I introduce a method to +spatially vary the intensity of guidance by assigning different scales to +individual pixels. + +
+
+
+
+
+ + ☆ TAIL: Text-Audio Incremental Learning + + +
+ Many studies combine text and audio to capture multi-modal information but +they overlook the model's generalization ability on new datasets. Introducing +new datasets may affect the feature space of the original dataset, leading to +catastrophic forgetting. Meanwhile, large model parameters can significantly +impact training performance. To address these limitations, we introduce a novel +task called Text-Audio Incremental Learning (TAIL) task for text-audio +retrieval, and propose a new method, PTAT, Prompt Tuning for Audio-Text +incremental learning. This method utilizes prompt tuning to optimize the model +parameters while incorporating an audio-text similarity and feature +distillation module to effectively mitigate catastrophic forgetting. We +benchmark our method and previous incremental learning methods on AudioCaps, +Clotho, BBC Sound Effects and Audioset datasets, and our method outperforms +previous methods significantly, particularly demonstrating stronger resistance +to forgetting on older datasets. Compared to the full-parameters Finetune +(Sequential) method, our model only requires 2.42\% of its parameters, +achieving 4.46\% higher performance. + +
+
+ comment: 4 figures, 5 tables +
+
+
+
+
+ + ☆ How to Move Your Dragon: Text-to-Motion Synthesis for Large-Vocabulary + Objects + + +
+ Motion synthesis for diverse object categories holds great potential for 3D +content creation but remains underexplored due to two key challenges: (1) the +lack of comprehensive motion datasets that include a wide range of high-quality +motions and annotations, and (2) the absence of methods capable of handling +heterogeneous skeletal templates from diverse objects. To address these +challenges, we contribute the following: First, we augment the Truebones Zoo +dataset, a high-quality animal motion dataset covering over 70 species, by +annotating it with detailed text descriptions, making it suitable for +text-based motion synthesis. Second, we introduce rig augmentation techniques +that generate diverse motion data while preserving consistent dynamics, +enabling models to adapt to various skeletal configurations. Finally, we +redesign existing motion diffusion models to dynamically adapt to arbitrary +skeletal templates, enabling motion synthesis for a diverse range of objects +with varying structures. Experiments show that our method learns to generate +high-fidelity motions from textual descriptions for diverse and even unseen +objects, setting a strong foundation for motion synthesis across diverse object +categories and skeletal templates. Qualitative results are available on this +link: t2m4lvo.github.io + +
+
+
+
+
+ + ☆ An Egocentric Vision-Language Model based Portable Real-time Smart + Assistant + + +
+ We present Vinci, a vision-language system designed to provide real-time, +comprehensive AI assistance on portable devices. At its core, Vinci leverages +EgoVideo-VL, a novel model that integrates an egocentric vision foundation +model with a large language model (LLM), enabling advanced functionalities such +as scene understanding, temporal grounding, video summarization, and future +planning. To enhance its utility, Vinci incorporates a memory module for +processing long video streams in real time while retaining contextual history, +a generation module for producing visual action demonstrations, and a retrieval +module that bridges egocentric and third-person perspectives to provide +relevant how-to videos for skill acquisition. Unlike existing systems that +often depend on specialized hardware, Vinci is hardware-agnostic, supporting +deployment across a wide range of devices, including smartphones and wearable +cameras. In our experiments, we first demonstrate the superior performance of +EgoVideo-VL on multiple public benchmarks, showcasing its vision-language +reasoning and contextual understanding capabilities. We then conduct a series +of user studies to evaluate the real-world effectiveness of Vinci, highlighting +its adaptability and usability in diverse scenarios. We hope Vinci can +establish a new framework for portable, real-time egocentric AI systems, +empowering users with contextual and actionable insights. Including the +frontend, backend, and models, all codes of Vinci are available at +https://github.com/OpenGVLab/vinci. + +
+
+
+
+
+ + ☆ Geometry-Constrained Monocular Scale Estimation Using Semantic + Segmentation for Dynamic Scenes + + +
+ Monocular visual localization plays a pivotal role in advanced driver +assistance systems and autonomous driving by estimating a vehicle's ego-motion +from a single pinhole camera. Nevertheless, conventional monocular visual +odometry encounters challenges in scale estimation due to the absence of depth +information during projection. Previous methodologies, whether rooted in +physical constraints or deep learning paradigms, contend with issues related +to computational complexity and the management of dynamic objects. This study +extends our prior research, presenting innovative strategies for ego-motion +estimation and the selection of ground points. Striving for a nuanced +equilibrium between computational efficiency and precision, we propose a hybrid +method that leverages the SegNeXt model for real-time applications, +encompassing both ego-motion estimation and ground point selection. Our +methodology incorporates dynamic object masks to eliminate unstable features +and employs ground plane masks for meticulous triangulation. Furthermore, we +exploit Geometry-constraint to delineate road regions for scale recovery. The +integration of this approach with the monocular version of ORB-SLAM3 +culminates in the accurate estimation of a road model, a pivotal component in +our scale recovery process. Rigorous experiments, conducted on the KITTI +dataset, systematically compare our method with existing monocular visual +odometry algorithms and contemporary scale recovery methodologies. The results +undeniably confirm the superior effectiveness of our approach, surpassing +state-of-the-art visual odometry algorithms. Our source code is available at +https://github.com/bFr0zNq/MVOSegScale. + +
+
+
+
+
+ + ☆ Synthetic Data is an Elegant GIFT for Continual Vision-Language Models CVPR 2025 + + +
+ Pre-trained Vision-Language Models (VLMs) require Continual Learning (CL) to +efficiently update their knowledge and adapt to various downstream tasks +without retraining from scratch. However, for VLMs, in addition to the loss of +knowledge previously learned from downstream tasks, pre-training knowledge is +also corrupted during continual fine-tuning. This issue is exacerbated by the +unavailability of original pre-training data, leaving VLM's generalization +ability degrading. In this paper, we propose GIFT, a novel continual +fine-tuning approach that utilizes synthetic data to overcome catastrophic +forgetting in VLMs. Taking advantage of recent advances in text-to-image +synthesis, we employ a pre-trained diffusion model to recreate both +pre-training and learned downstream task data. In this way, the VLM can revisit +previous knowledge through distillation on matching diffusion-generated images +and corresponding text prompts. Leveraging the broad distribution and high +alignment between synthetic image-text pairs in VLM's feature space, we propose +a contrastive distillation loss along with an image-text alignment constraint. +To further combat in-distribution overfitting and enhance distillation +performance with limited amount of generated data, we incorporate adaptive +weight consolidation, utilizing Fisher information from these synthetic +image-text pairs and achieving a better stability-plasticity balance. Extensive +experiments demonstrate that our method consistently outperforms previous +state-of-the-art approaches across various settings. + +
+
+ comment: This work is accepted by CVPR 2025. Modifications may be performed +
+
+
+
+
+ + ☆ Spiking Meets Attention: Efficient Remote Sensing Image Super-Resolution + with Attention Spiking Neural Networks + + +
+ Spiking neural networks (SNNs) are emerging as a promising alternative to +traditional artificial neural networks (ANNs), offering biological plausibility +and energy efficiency. Despite these merits, SNNs are frequently hampered by +limited capacity and insufficient representation power, yet remain +underexplored in remote sensing super-resolution (SR) tasks. In this paper, we +first observe that spiking signals exhibit drastic intensity variations across +diverse textures, highlighting an active learning state of the neurons. This +observation motivates us to apply SNNs for efficient SR of RSIs. Inspired by +the success of attention mechanisms in representing salient information, we +devise the spiking attention block (SAB), a concise yet effective component +that optimizes membrane potentials through inferred attention weights, which, +in turn, regulates spiking activity for superior feature representation. Our +key contributions include: 1) we bridge the independent modulation between +temporal and channel dimensions, facilitating joint feature correlation +learning, and 2) we access the global self-similar patterns in large-scale +remote sensing imagery to infer spatial attention weights, incorporating +effective priors for realistic and faithful reconstruction. Building upon SAB, +we proposed SpikeSR, which achieves state-of-the-art performance across various +remote sensing benchmarks such as AID, DOTA, and DIOR, while maintaining high +computational efficiency. The code of SpikeSR will be available upon paper +acceptance. + +
+
+
+
+
+ + ☆ Energy-Guided Optimization for Personalized Image Editing with + Pretrained Text-to-Image Diffusion Models + + +
+ The rapid advancement of pretrained text-driven diffusion models has +significantly enriched applications in image generation and editing. However, +as the demand for personalized content editing increases, new challenges emerge, +especially when dealing with arbitrary objects and complex scenes. Existing +methods usually mistake the mask for the object shape prior and thus struggle to +achieve a seamless integration result. The most commonly used inversion noise +initialization also hinders identity consistency with the target object. +To address these challenges, we propose a novel training-free framework that +formulates personalized content editing as the optimization of edited images in +the latent space, using diffusion models as the energy function guidance +conditioned by reference text-image pairs. A coarse-to-fine strategy is +proposed that employs text energy guidance at the early stage to achieve a +natural transition toward the target class and uses point-to-point +feature-level image energy guidance to perform fine-grained appearance +alignment with the target object. Additionally, we introduce the latent space +content composition to enhance overall identity consistency with the target. +Extensive experiments demonstrate that our method excels in object replacement +even with a large domain gap, highlighting its potential for high-quality, +personalized image editing. + +
+
+
+
+
+ + ☆ Bridging the Vision-Brain Gap with an Uncertainty-Aware Blur Prior + + +
+ Can our brain signals faithfully reflect the original visual stimuli, even +including high-frequency details? Although human perceptual and cognitive +capacities enable us to process and remember visual information, these +abilities are constrained by several factors, such as limited attentional +resources and the finite capacity of visual memory. When visual stimuli are +processed by human visual system into brain signals, some information is +inevitably lost, leading to a discrepancy known as the \textbf{System GAP}. +Additionally, perceptual and cognitive dynamics, along with technical noise in +signal acquisition, degrade the fidelity of brain signals relative to the +visual stimuli, known as the \textbf{Random GAP}. When encoded brain +representations are directly aligned with the corresponding pretrained image +features, the System GAP and Random GAP between paired data challenge the +model, requiring it to bridge these gaps. However, in the context of limited +paired data, these gaps are difficult for the model to learn, leading to +overfitting and poor generalization to new data. To address these GAPs, we +propose a simple yet effective approach called the \textbf{Uncertainty-aware +Blur Prior (UBP)}. It estimates the uncertainty within the paired data, +reflecting the mismatch between brain signals and visual stimuli. Based on this +uncertainty, UBP dynamically blurs the high-frequency details of the original +images, reducing the impact of the mismatch and improving alignment. Our method +achieves a top-1 accuracy of \textbf{50.9\%} and a top-5 accuracy of +\textbf{79.7\%} on the zero-shot brain-to-image retrieval task, surpassing +previous state-of-the-art methods by margins of \textbf{13.7\%} and +\textbf{9.8\%}, respectively. Code is available at +\href{https://github.com/HaitaoWuTJU/Uncertainty-aware-Blur-Prior}{GitHub}. + +
+
+
+
+
+ + ☆ Learning 3D Medical Image Models From Brain Functional Connectivity + Network Supervision For Mental Disorder Diagnosis + + +
+ In MRI-based mental disorder diagnosis, most previous studies focus on +functional connectivity network (FCN) derived from functional MRI (fMRI). +However, the small size of annotated fMRI datasets restricts its wide +application. Meanwhile, structural MRIs (sMRIs), such as 3D T1-weighted (T1w) +MRI, which are commonly used and readily accessible in clinical settings, are +often overlooked. To integrate the complementary information from both function +and structure for improved diagnostic accuracy, we propose CINP (Contrastive +Image-Network Pre-training), a framework that employs contrastive learning +between sMRI and FCN. During pre-training, we incorporate masked image modeling +and network-image matching to enhance visual representation learning and +modality alignment. Since the CINP facilitates knowledge transfer from FCN to +sMRI, we introduce network prompting. It utilizes only sMRI from suspected +patients and a small amount of FCNs from different patient classes for +diagnosing mental disorders, which is practical in real-world clinical +scenarios. The competitive performance on three mental disorder diagnosis tasks +demonstrates the effectiveness of the CINP in integrating multimodal MRI +information, as well as the potential of incorporating sMRI into clinical +diagnosis using network prompting. + +
+
+
+
+
+ + ☆ FUSE: First-Order and Second-Order Unified SynthEsis in Stochastic + Optimization + + +
+ Stochastic optimization methods have actively been playing a critical role in +modern machine learning algorithms to deliver decent performance. While +numerous works have proposed and developed diverse approaches, first-order and +second-order methods are in entirely different situations. The former is +significantly pivotal and dominating in emerging deep learning but only leads +convergence to a stationary point. However, second-order methods are less +popular due to their computational intensity in large-dimensional problems. +This paper presents a novel method that leverages both the first-order and +second-order methods in a unified algorithmic framework, termed FUSE, from +which a practical version (PV) is derived accordingly. FUSE-PV stands as a +simple yet efficient optimization method involving a switch-over between first +and second orders. Additionally, we develop different criteria that determine +when to switch. FUSE-PV has provably shown a smaller computational complexity +than SGD and Adam. To validate our proposed scheme, we present an ablation +study on several simple test functions and show a comparison with baselines for +benchmark datasets. + +
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ☆ MASTER: Multimodal Segmentation with Text Prompts + + +
+ RGB-Thermal fusion is a potential solution for various weather and light +conditions in challenging scenarios. However, plenty of studies focus on +designing complex modules to fuse different modalities. With the widespread +application of large language models (LLMs), valuable information can be more +effectively extracted from natural language. Therefore, we aim to leverage the +advantages of large language models to design a structurally simple and highly +adaptable multimodal fusion model architecture. We proposed MultimodAl +Segmentation with TExt PRompts (MASTER) architecture, which integrates LLM into +the fusion of RGB-Thermal multimodal data and allows complex query text to +participate in the fusion process. Our model utilizes a dual-path structure to +extract information from different modalities of images. Additionally, we +employ LLM as the core module for multimodal fusion, enabling the model to +generate learnable codebook tokens from RGB, thermal images, and textual +information. A lightweight image decoder is used to obtain semantic +segmentation results. The proposed MASTER performs exceptionally well in +benchmark tests across various automated driving scenarios, yielding promising +results. + +
+
+
+
+
+ + ☆ Conformal forecasting for surgical instrument trajectory + + +
+ Forecasting surgical instrument trajectories and predicting the next surgical +action recently started to attract attention from the research community. Both +these tasks are crucial for automation and assistance in endoscopy surgery. +Given the safety-critical nature of these tasks, reliable uncertainty +quantification is essential. Conformal prediction is a fast-growing and widely +recognized framework for uncertainty estimation in machine learning and +computer vision, offering distribution-free, theoretically valid prediction +intervals. In this work, we explore the application of standard conformal +prediction and conformalized quantile regression to estimate uncertainty in +forecasting surgical instrument motion, i.e., predicting direction and +magnitude of surgical instruments' future motion. We analyze and compare their +coverage and interval sizes, assessing the impact of multiple hypothesis +testing and correction methods. Additionally, we show how these techniques can +be employed to produce useful uncertainty heatmaps. To the best of our +knowledge, this is the first study applying conformal prediction to surgical +guidance, marking an initial step toward constructing principled prediction +intervals with formal coverage guarantees in this domain. + +
+
+
+
+
+ + ☆ DuCos: Duality Constrained Depth Super-Resolution via Foundation Model + + +
+ We introduce DuCos, a novel depth super-resolution framework grounded in +Lagrangian duality theory, offering a flexible integration of multiple +constraints and reconstruction objectives to enhance accuracy and robustness. +Our DuCos is the first to significantly improve generalization across diverse +scenarios with foundation models as prompts. The prompt design consists of two +key components: Correlative Fusion (CF) and Gradient Regulation (GR). CF +facilitates precise geometric alignment and effective fusion between prompt and +depth features, while GR refines depth predictions by enforcing consistency +with sharp-edged depth maps derived from foundation models. Crucially, these +prompts are seamlessly embedded into the Lagrangian constraint term, forming a +synergistic and principled framework. Extensive experiments demonstrate that +DuCos outperforms existing state-of-the-art methods, achieving superior +accuracy, robustness, and generalization. The source codes and pre-trained +models will be publicly available. + +
+
+
+
+
+ + ☆ The Role of Visual Modality in Multimodal Mathematical Reasoning: + Challenges and Insights + + +
+ Recent research has increasingly focused on multimodal mathematical +reasoning, particularly emphasizing the creation of relevant datasets and +benchmarks. Despite this, the role of visual information in reasoning has been +underexplored. Our findings show that existing multimodal mathematical models +minimally leverage visual information, and model performance remains largely +unaffected by changes to or removal of images in the dataset. We attribute this +to the dominance of textual information and answer options that inadvertently +guide the model to correct answers. To improve evaluation methods, we introduce +the HC-M3D dataset, specifically designed to require image reliance for +problem-solving and to challenge models with similar, yet distinct, images that +change the correct answer. In testing leading models, their failure to detect +these subtle visual differences suggests limitations in current visual +perception capabilities. Additionally, we observe that the common approach of +improving general VQA capabilities by combining various types of image encoders +does not contribute to math reasoning performance. This finding also presents a +challenge to enhancing visual reliance during math reasoning. Our benchmark and +code will be available at +https://github.com/Yufang-Liu/visual_modality_role. + +
+
+
+
+
+ + ☆ WeakSupCon: Weakly Supervised Contrastive Learning for Encoder + Pre-training + + +
+ Weakly supervised multiple instance learning (MIL) is a challenging task +given that only bag-level labels are provided, while each bag typically +contains multiple instances. This topic has been extensively studied in +histopathological image analysis, where labels are usually available only at +the whole slide image (WSI) level, while each whole slide image can be divided +into thousands of small image patches for training. The dominant MIL approaches +take fixed patch features as inputs to address computational constraints and +ensure model stability. These features are commonly generated by encoders +pre-trained on ImageNet, foundation encoders pre-trained on large datasets, or +through self-supervised learning on local datasets. While the self-supervised +encoder pre-training on the same dataset as downstream MIL tasks helps mitigate +domain shift and generate better features, the bag-level labels are not +utilized during the process, and the features of patches from different +categories may cluster together, reducing classification performance on MIL +tasks. Recently, pre-training with supervised contrastive learning (SupCon) has +demonstrated superior performance compared to self-supervised contrastive +learning and even end-to-end training on traditional image classification +tasks. In this paper, we propose a novel encoder pre-training method for +downstream MIL tasks called Weakly Supervised Contrastive Learning (WeakSupCon) +that utilizes bag-level labels. In our method, we employ multi-task learning +and define distinct contrastive learning losses for samples with different bag +labels. Our experiments demonstrate that the features generated using +WeakSupCon significantly enhance MIL classification performance compared to +self-supervised approaches across three datasets. + +
+
+
+
+
+ + ☆ CA-W3D: Leveraging Context-Aware Knowledge for Weakly Supervised + Monocular 3D Detection + + +
+ Weakly supervised monocular 3D detection, while less annotation-intensive, +often struggles to capture the global context required for reliable 3D +reasoning. Conventional label-efficient methods focus on object-centric +features, neglecting contextual semantic relationships that are critical in +complex scenes. In this work, we propose a Context-Aware Weak Supervision for +Monocular 3D object detection, namely CA-W3D, to address this limitation in a +two-stage training paradigm. Specifically, we first introduce a pre-training +stage employing Region-wise Object Contrastive Matching (ROCM), which aligns +regional object embeddings derived from a trainable monocular 3D encoder and a +frozen open-vocabulary 2D visual grounding model. This alignment encourages the +monocular encoder to discriminate scene-specific attributes and acquire richer +contextual knowledge. In the second stage, we incorporate a pseudo-label +training process with a Dual-to-One Distillation (D2OD) mechanism, which +effectively transfers contextual priors into the monocular encoder while +preserving spatial fidelity and maintaining computational efficiency during +inference. Extensive experiments conducted on the public KITTI benchmark +demonstrate the effectiveness of our approach, surpassing the SoTA method over +all metrics, highlighting the importance of contextual-aware knowledge in +weakly-supervised monocular 3D detection. + +
+
+ comment: The paper includes 8 pages, 6 figures and 4 tables +
+
+
+
+
+ + ☆ Robust Multi-View Learning via Representation Fusion of Sample-Level + Attention and Alignment of Simulated Perturbation + + +
+ Recently, multi-view learning (MVL) has garnered significant attention due to +its ability to fuse discriminative information from multiple views. However, +real-world multi-view datasets are often heterogeneous and imperfect, which +usually makes MVL methods designed for specific combinations of views lack +application potential and limits their effectiveness. To address this issue, we +propose a novel robust MVL method (namely RML) with simultaneous representation +fusion and alignment. Specifically, we introduce a simple yet effective +multi-view transformer fusion network where we transform heterogeneous +multi-view data into homogeneous word embeddings, and then integrate multiple +views by the sample-level attention mechanism to obtain a fused representation. +Furthermore, we propose a simulated perturbation based multi-view contrastive +learning framework that dynamically generates the noise and unusable +perturbations for simulating imperfect data conditions. The simulated noisy and +unusable data obtain two distinct fused representations, and we utilize +contrastive learning to align them for learning discriminative and robust +representations. Our RML is self-supervised and can also be applied for +downstream tasks as a regularization. In experiments, we employ it in +unsupervised multi-view clustering, noise-label classification, and as a +plug-and-play module for cross-modal hashing retrieval. Extensive comparison +experiments and ablation studies validate the effectiveness of RML. + +
+
+
+
+
+ + ☆ DM-Adapter: Domain-Aware Mixture-of-Adapters for Text-Based Person + Retrieval AAAI 2025 + + +
+ Text-based person retrieval (TPR) has gained significant attention as a +fine-grained and challenging task that closely aligns with practical +applications. Tailoring CLIP to person domain is now an emerging research topic +due to the abundant knowledge of vision-language pretraining, but challenges +still remain during fine-tuning: (i) Previous full-model fine-tuning in TPR is +computationally expensive and prone to overfitting. (ii) Existing +parameter-efficient transfer learning (PETL) for TPR lacks fine-grained +feature extraction. To address these issues, we propose Domain-Aware +Mixture-of-Adapters (DM-Adapter), which unifies Mixture-of-Experts (MOE) and +PETL to enhance fine-grained feature representations while maintaining +efficiency. Specifically, Sparse Mixture-of-Adapters is designed in parallel to +MLP layers in both vision and language branches, where different experts +specialize in distinct aspects of person knowledge to handle features more +finely. To promote the router to exploit domain information effectively and +alleviate the routing imbalance, Domain-Aware Router is then developed by +building a novel gating function and injecting learnable domain-aware prompts. +Extensive experiments show that our DM-Adapter achieves state-of-the-art +performance, outperforming previous methods by a significant margin. + +
+
+ comment: 9 pages, 5 figures, accepted by AAAI 2025 +
+
+
+
+
+ + ☆ Robust Computer-Vision based Construction Site Detection for + Assistive-Technology Applications + + +
+ Navigating urban environments poses significant challenges for people with +disabilities, particularly those with blindness and low vision. Environments +with dynamic and unpredictable elements like construction sites are especially +challenging. Construction sites introduce hazards like uneven surfaces, +obstructive barriers, hazardous materials, and excessive noise, and they can +alter routing, complicating safe mobility. Existing assistive technologies are +limited, as navigation apps do not account for construction sites during trip +planning, and detection tools that attempt hazard recognition struggle to +address the extreme variability of construction paraphernalia. This study +introduces a novel computer vision-based system that integrates open-vocabulary +object detection, a YOLO-based scaffolding-pole detection model, and an optical +character recognition (OCR) module to comprehensively identify and interpret +construction site elements for assistive navigation. In static testing across +seven construction sites, the system achieved an overall accuracy of 88.56\%, +reliably detecting objects from 2m to 10m within a 0$^\circ$ -- 75$^\circ$ +angular offset. At closer distances (2--4m), the detection rate was 100\% at +all tested angles. At + +
+
+
+
+
+ + ☆ Real-time Spatial-temporal Traversability Assessment via Feature-based + Sparse Gaussian Process + + +
+ Terrain analysis is critical for the practical application of ground mobile +robots in real-world tasks, especially in outdoor unstructured environments. In +this paper, we propose a novel spatial-temporal traversability assessment +method, which aims to enable autonomous robots to effectively navigate through +complex terrains. Our approach utilizes sparse Gaussian processes (SGP) to +extract geometric features (curvature, gradient, elevation, etc.) directly from +point cloud scans. These features are then used to construct a high-resolution +local traversability map. Then, we design a spatial-temporal Bayesian Gaussian +kernel (BGK) inference method to dynamically evaluate traversability scores, +integrating historical and real-time data while considering factors such as +slope, flatness, gradient, and uncertainty metrics. GPU acceleration is applied +in the feature extraction step, and the system achieves real-time performance. +Extensive simulation experiments across diverse terrain scenarios demonstrate +that our method outperforms SOTA approaches in both accuracy and computational +efficiency. Additionally, we develop an autonomous navigation framework +integrated with the traversability map and validate it with a differential +driven vehicle in complex outdoor environments. Our code will be open-source +for further research and development by the community, +https://github.com/ZJU-FAST-Lab/FSGP_BGK. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Q-PART: Quasi-Periodic Adaptive Regression with Test-time Training for + Pediatric Left Ventricular Ejection Fraction Regression CVPR 2025 + + +
+ In this work, we address the challenge of adaptive pediatric Left Ventricular +Ejection Fraction (LVEF) assessment. While Test-time Training (TTT) approaches +show promise for this task, they suffer from two significant limitations. +Existing TTT works are primarily designed for classification tasks rather than +continuous value regression, and they lack mechanisms to handle the +quasi-periodic nature of cardiac signals. To tackle these issues, we propose a +novel \textbf{Q}uasi-\textbf{P}eriodic \textbf{A}daptive \textbf{R}egression +with \textbf{T}est-time Training (Q-PART) framework. In the training stage, the +proposed Quasi-Period Network decomposes the echocardiogram into periodic and +aperiodic components within latent space by combining parameterized helix +trajectories with Neural Controlled Differential Equations. During inference, +our framework further employs a variance minimization strategy across image +augmentations that simulate common quality issues in echocardiogram +acquisition, along with differential adaptation rates for periodic and +aperiodic components. Theoretical analysis is provided to demonstrate that our +variance minimization objective effectively bounds the regression error under +mild conditions. Furthermore, extensive experiments across three pediatric age +groups demonstrate that Q-PART not only significantly outperforms existing +approaches in pediatric LVEF prediction, but also exhibits strong clinical +screening capability with high mAUROC scores (up to 0.9747) and maintains +gender-fair performance across all metrics, validating its robustness and +practical utility in pediatric echocardiography analysis. + +
+
+ comment: Accepted to CVPR 2025 +
+
+
+
+
+ + ☆ Token-Efficient Long Video Understanding for Multimodal LLMs + + +
+ Recent advances in video-based multimodal large language models (Video-LLMs) +have significantly improved video understanding by processing videos as +sequences of image frames. However, many existing methods treat frames +independently in the vision backbone, lacking explicit temporal modeling, which +limits their ability to capture dynamic patterns and efficiently handle long +videos. To address these limitations, we introduce STORM +(\textbf{S}patiotemporal \textbf{TO}ken \textbf{R}eduction for +\textbf{M}ultimodal LLMs), a novel architecture incorporating a dedicated +temporal encoder between the image encoder and the LLM. Our temporal encoder +leverages the Mamba State Space Model to integrate temporal information into +image tokens, generating enriched representations that preserve inter-frame +dynamics across the entire video sequence. This enriched encoding not only +enhances video reasoning capabilities but also enables effective token +reduction strategies, including test-time sampling and training-based temporal +and spatial pooling, substantially reducing computational demands on the LLM +without sacrificing key temporal information. By integrating these techniques, +our approach simultaneously reduces training and inference latency while +improving performance, enabling efficient and robust video understanding over +extended temporal contexts. Extensive evaluations show that STORM achieves +state-of-the-art results across various long video understanding benchmarks +(more than 5\% improvement on MLVU and LongVideoBench) while reducing the +computation costs by up to $8\times$ and the decoding latency by +2.4-2.9$\times$ for the fixed numbers of input frames. Project page is +available at https://research.nvidia.com/labs/lpr/storm + +
+
+
+
+
+ + ☆ Diff-Reg v2: Diffusion-Based Matching Matrix Estimation for Image + Matching and 3D Registration + + +
+ Establishing reliable correspondences is crucial for all registration tasks, +including 2D image registration, 3D point cloud registration, and 2D-3D +image-to-point cloud registration. However, these tasks are often complicated +by challenges such as scale inconsistencies, symmetry, and large deformations, +which can lead to ambiguous matches. Previous feature-based and +correspondence-based methods typically rely on geometric or semantic features +to generate or polish initial potential correspondences. Some methods typically +leverage specific geometric priors, such as topological preservation, to devise +diverse and innovative strategies tailored to a given enhancement goal, which +cannot be exhaustively enumerated. Additionally, many previous approaches rely +on a single-step prediction head, which can struggle with local minima in +complex matching scenarios. To address these challenges, we introduce an +innovative paradigm that leverages a diffusion model in matrix space for robust +matching matrix estimation. Our model treats correspondence estimation as a +denoising diffusion process in the matching matrix space, gradually refining +the intermediate matching matrix to the optimal one. Specifically, we apply the +diffusion model in the doubly stochastic matrix space for 3D-3D and 2D-3D +registration tasks. In the 2D image registration task, we deploy the diffusion +model in a matrix subspace where dual-softmax projection regularization is +applied. For all three registration tasks, we provide adaptive matching matrix +embedding implementations tailored to the specific characteristics of each task +while maintaining a consistent "match-to-warp" encoding pattern. Furthermore, +we adopt a lightweight design for the denoising module. In inference, once +points or image features are extracted and fixed, this module performs +multi-step denoising predictions through reverse sampling. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2403.19919 +
+
+
+
+
+ + ☆ DVM-SLAM: Decentralized Visual Monocular Simultaneous Localization and + Mapping for Multi-Agent Systems + + +
+ Cooperative Simultaneous Localization and Mapping (C-SLAM) enables multiple +agents to work together in mapping unknown environments while simultaneously +estimating their own positions. This approach enhances robustness, scalability, +and accuracy by sharing information between agents, reducing drift, and +enabling collective exploration of larger areas. In this paper, we present +Decentralized Visual Monocular SLAM (DVM-SLAM), the first open-source +decentralized monocular C-SLAM system. By only utilizing low-cost and +light-weight monocular vision sensors, our system is well suited for small +robots and micro aerial vehicles (MAVs). DVM-SLAM's real-world applicability is +validated on physical robots with a custom collision avoidance framework, +showcasing its potential in real-time multi-agent autonomous navigation +scenarios. We also demonstrate comparable accuracy to state-of-the-art +centralized monocular C-SLAM systems. We open-source our code and provide +supplementary material online. + +
+
+
+
+
+ + ☆ GAGrasp: Geometric Algebra Diffusion for Dexterous Grasping ICRA 2025 + + +
+ We propose GAGrasp, a novel framework for dexterous grasp generation that +leverages geometric algebra representations to enforce equivariance to SE(3) +transformations. By encoding the SE(3) symmetry constraint directly into the +architecture, our method improves data and parameter efficiency while enabling +robust grasp generation across diverse object poses. Additionally, we +incorporate a differentiable physics-informed refinement layer, which ensures +that generated grasps are physically plausible and stable. Extensive +experiments demonstrate the model's superior performance in generalization, +stability, and adaptability compared to existing methods. Additional details at +https://gagrasp.github.io/ + +
+
+ comment: Accepted at ICRA 2025 +
+
+
+
+
+ + ☆ Simple Self Organizing Map with Visual Transformer + + +
+ Vision Transformers (ViTs) have demonstrated exceptional performance in +various vision tasks. However, they tend to underperform on smaller datasets +due to their inherent lack of inductive biases. Current approaches address this +limitation implicitly—often by pairing ViTs with pretext tasks or by distilling +knowledge from convolutional neural networks (CNNs) to strengthen the prior. In +contrast, Self-Organizing Maps (SOMs), a widely adopted self-supervised +framework, are inherently structured to preserve topology and spatial +organization, making them a promising candidate to directly address the +limitations of ViTs in limited or small training datasets. Despite this +potential, equipping SOMs with modern deep learning architectures remains +largely unexplored. In this study, we conduct a novel exploration on how Vision +Transformers (ViTs) and Self-Organizing Maps (SOMs) can empower each other, +aiming to bridge this critical research gap. Our findings demonstrate that +these architectures can synergistically enhance each other, leading to +significantly improved performance in both unsupervised and supervised tasks. +Code will be publicly available. + +
+
+ comment: 5 pages, 4 figures. Submitted to IEEE. All experiments and code work + were performed by the first author, with the second author serving in a + PI/mentor role, guiding the progression of the work +
+
+
+
+
+ + ☆ SCSA: A Plug-and-Play Semantic Continuous-Sparse Attention for Arbitrary + Semantic Style Transfer CVPR 2025 + + +
+ Attention-based arbitrary style transfer methods, including CNN-based, +Transformer-based, and Diffusion-based, have flourished and produced +high-quality stylized images. However, they perform poorly on the content and +style images with the same semantics, i.e., the style of the corresponding +semantic region of the generated stylized image is inconsistent with that of +the style image. We argue that the root cause lies in their failure to consider +the relationship between local regions and semantic regions. To address this +issue, we propose a plug-and-play semantic continuous-sparse attention, dubbed +SCSA, for arbitrary semantic style transfer -- each query point considers +certain key points in the corresponding semantic region. Specifically, semantic +continuous attention ensures each query point fully attends to all the +continuous key points in the same semantic region that reflect the overall +style characteristics of that region; Semantic sparse attention allows each +query point to focus on the most similar sparse key point in the same semantic +region that exhibits the specific stylistic texture of that region. By +combining the two modules, the resulting SCSA aligns the overall style of the +corresponding semantic regions while transferring the vivid textures of these +regions. Qualitative and quantitative results prove that SCSA enables +attention-based arbitrary style transfer methods to produce high-quality +semantic stylized images. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ☆ Fractional Correspondence Framework in Detection Transformer + + +
+ The Detection Transformer (DETR), by incorporating the Hungarian algorithm, +has significantly simplified the matching process in object detection tasks. +This algorithm facilitates optimal one-to-one matching of predicted bounding +boxes to ground-truth annotations during training. While effective, this strict +matching process does not inherently account for the varying densities and +distributions of objects, leading to suboptimal correspondences such as failing +to handle multiple detections of the same object or missing small objects. To +address this, we propose the Regularized Transport Plan (RTP). RTP introduces a +flexible matching strategy that captures the cost of aligning predictions with +ground truths to find the most accurate correspondences between these sets. By +utilizing the differentiable Sinkhorn algorithm, RTP allows for soft, +fractional matching rather than strict one-to-one assignments. This approach +enhances the model's capability to manage varying object densities and +distributions effectively. Our extensive evaluations on the MS-COCO and VOC +benchmarks demonstrate the effectiveness of our approach. RTP-DETR surpasses +the performance of the Deform-DETR and the recently introduced DINO-DETR, +achieving absolute gains in mAP of +3.8% and +1.7%, respectively. + +
+
+
+
+
+ + ☆ WeakMedSAM: Weakly-Supervised Medical Image Segmentation via SAM with + Sub-Class Exploration and Prompt Affinity Mining + + +
+ We have witnessed remarkable progress in foundation models in vision tasks. +Currently, several recent works have utilized the segmenting anything model +(SAM) to boost the segmentation performance in medical images, where most of +them focus on training an adaptor for fine-tuning a large amount of pixel-wise +annotated medical images following a fully supervised manner. In this paper, to +reduce the labeling cost, we investigate a novel weakly-supervised SAM-based +segmentation model, namely WeakMedSAM. Specifically, our proposed WeakMedSAM +contains two modules: 1) to mitigate severe co-occurrence in medical images, a +sub-class exploration module is introduced to learn accurate feature +representations. 2) to improve the quality of the class activation maps, our +prompt affinity mining module utilizes the prompt capability of SAM to obtain +an affinity map for random-walk refinement. Our method can be applied to any +SAM-like backbone, and we conduct experiments with SAMUS and EfficientSAM. The +experimental results on three popularly-used benchmark datasets, i.e., BraTS +2019, AbdomenCT-1K, and MSD Cardiac dataset, show the promising results of our +proposed WeakMedSAM. Our code is available at +https://github.com/wanghr64/WeakMedSAM. + +
+
+
+
+
+ + ☆ Image-Based Relocalization and Alignment for Long-Term Monitoring of + Dynamic Underwater Environments + + +
+ Effective monitoring of underwater ecosystems is crucial for tracking +environmental changes, guiding conservation efforts, and ensuring long-term +ecosystem health. However, automating underwater ecosystem management with +robotic platforms remains challenging due to the complexities of underwater +imagery, which pose significant difficulties for traditional visual +localization methods. We propose an integrated pipeline that combines Visual +Place Recognition (VPR), feature matching, and image segmentation on +video-derived images. This method enables robust identification of revisited +areas, estimation of rigid transformations, and downstream analysis of +ecosystem changes. Furthermore, we introduce the SQUIDLE+ VPR Benchmark-the +first large-scale underwater VPR benchmark designed to leverage an extensive +collection of unstructured data from multiple robotic platforms, spanning time +intervals from days to years. The dataset encompasses diverse trajectories, +arbitrary overlap and diverse seafloor types captured under varying +environmental conditions, including differences in depth, lighting, and +turbidity. Our code is available at: https://github.com/bev-gorry/underloc + +
+
+
+
+
+ + ☆ Brain Tumor Detection in MRI Based on Federated Learning with YOLOv11 + + +
+ One of the primary challenges in medical diagnostics is the accurate and +efficient use of magnetic resonance imaging (MRI) for the detection of brain +tumors. But the current machine learning (ML) approaches have two major +limitations, data privacy and high latency. To solve the problem, in this work +we propose a federated learning architecture for more accurate brain tumor +detection incorporating the YOLOv11 algorithm. In contrast to earlier methods +of centralized learning, our federated learning approach protects the +underlying medical data while supporting cooperative deep learning model +training across multiple institutions. To allow the YOLOv11 model to locate and +identify tumor areas, we adjust it to handle MRI data. To ensure robustness and +generalizability, the model is trained and tested on a wide range of MRI data +collected from several anonymous medical facilities. The results indicate that +our method significantly maintains higher accuracy than conventional +approaches. + +
+
+
+
+
+ + ☆ Instrument-Splatting: Controllable Photorealistic Reconstruction of + Surgical Instruments Using Gaussian Splatting + + +
+ Real2Sim is becoming increasingly important with the rapid development of +surgical artificial intelligence (AI) and autonomy. In this work, we propose a +novel Real2Sim methodology, \textit{Instrument-Splatting}, that leverages 3D +Gaussian Splatting to provide fully controllable 3D reconstruction of surgical +instruments from monocular surgical videos. To maintain both high visual +fidelity and manipulability, we introduce a geometry pre-training to bind +Gaussian point clouds on part mesh with accurate geometric priors and define a +forward kinematics to control the Gaussians as flexible as real instruments. +Afterward, to handle unposed videos, we design a novel instrument pose tracking +method leveraging semantics-embedded Gaussians to robustly refine per-frame +instrument poses and joint states in a render-and-compare manner, which allows +our instrument Gaussian to accurately learn textures and reach photorealistic +rendering. We validated our method on 2 publicly released surgical videos and 4 +videos collected on ex vivo tissues and green screens. Quantitative and +qualitative evaluations demonstrate the effectiveness and superiority of the +proposed method. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Surgical Gaussian Surfels: Highly Accurate Real-time Surgical Scene + Rendering + + +
+ Accurate geometric reconstruction of deformable tissues in monocular +endoscopic video remains a fundamental challenge in robot-assisted minimally +invasive surgery. Although recent volumetric and point primitive methods based +on neural radiance fields (NeRF) and 3D Gaussian primitives have efficiently +rendered surgical scenes, they still struggle with handling artifact-free tool +occlusions and preserving fine anatomical details. These limitations stem from +unrestricted Gaussian scaling and insufficient surface alignment constraints +during reconstruction. To address these issues, we introduce Surgical Gaussian +Surfels (SGS), which transforms anisotropic point primitives into +surface-aligned elliptical splats by constraining the scale component of the +Gaussian covariance matrix along the view-aligned axis. We predict accurate +surfel motion fields using a lightweight Multi-Layer Perceptron (MLP) coupled +with locality constraints to handle complex tissue deformations. We use +homodirectional view-space positional gradients to capture fine image details +by splitting Gaussian Surfels in over-reconstructed regions. In addition, we +define surface normals as the direction of the steepest density change within +each Gaussian surfel primitive, enabling accurate normal estimation without +requiring monocular normal priors. We evaluate our method on two in-vivo +surgical datasets, where it outperforms current state-of-the-art methods in +surface geometry, normal map quality, and rendering efficiency, while remaining +competitive in real-time rendering performance. We make our code available at +https://github.com/aloma85/SurgicalGaussianSurfels + +
+
+
+
+
+ + ☆ Spatial-Temporal Perception with Causal Inference for Naturalistic + Driving Action Recognition + + +
+ Naturalistic driving action recognition is essential for vehicle cabin +monitoring systems. However, the complexity of real-world backgrounds presents +significant challenges for this task, and previous approaches have struggled +with practical implementation due to their limited ability to observe subtle +behavioral differences and effectively learn inter-frame features from video. +In this paper, we propose a novel Spatial-Temporal Perception (STP) +architecture that emphasizes both temporal information and spatial +relationships between key objects, incorporating a causal decoder to perform +behavior recognition and temporal action localization. Without requiring +multimodal input, STP directly extracts temporal and spatial distance features +from RGB video clips. Subsequently, these dual features are jointly encoded by +maximizing the expected likelihood across all possible permutations of the +factorization order. By integrating temporal and spatial features at different +scales, STP can perceive subtle behavioral changes in challenging scenarios. +Additionally, we introduce a causal-aware module to explore relationships +between video frame features, significantly enhancing detection efficiency and +performance. We validate the effectiveness of our approach using two publicly +available driver distraction detection benchmarks. The results demonstrate that +our framework achieves state-of-the-art performance. + +
+
+
+
+
+ + ☆ FREAK: Frequency-modulated High-fidelity and Real-time Audio-driven + Talking Portrait Synthesis + + +
+ Achieving high-fidelity lip-speech synchronization in audio-driven talking +portrait synthesis remains challenging. While multi-stage pipelines or +diffusion models yield high-quality results, they suffer from high +computational costs. Some approaches perform well on specific individuals with +low resources, yet still exhibit mismatched lip movements. The aforementioned +methods are modeled in the pixel domain. We observed that there are noticeable +discrepancies in the frequency domain between the synthesized talking videos +and natural videos. Currently, no research on talking portrait synthesis has +considered this aspect. To address this, we propose a FREquency-modulated, +high-fidelity, and real-time Audio-driven talKing portrait synthesis framework, +named FREAK, which models talking portraits from the frequency domain +perspective, enhancing the fidelity and naturalness of the synthesized +portraits. FREAK introduces two novel frequency-based modules: 1) the Visual +Encoding Frequency Modulator (VEFM) to couple multi-scale visual features in +the frequency domain, better preserving visual frequency information and +reducing the gap in the frequency spectrum between synthesized and natural +frames; and 2) the Audio Visual Frequency Modulator (AVFM) to help the model +learn the talking pattern in the frequency domain and improve audio-visual +synchronization. Additionally, we optimize the model in both pixel domain and +frequency domain jointly. Furthermore, FREAK supports seamless switching +between one-shot and video dubbing settings, offering enhanced flexibility. Due +to its superior performance, it can simultaneously support high-resolution +video results and real-time inference. Extensive experiments demonstrate that +our method synthesizes high-fidelity talking portraits with detailed facial +textures and precise lip synchronization in real-time, outperforming +state-of-the-art methods. + +
+
+
+
+
+ + ☆ PP-DocBee: Improving Multimodal Document Understanding Through a Bag of + Tricks + + +
+ With the rapid advancement of digitalization, various document images are +being applied more extensively in production and daily life, and there is an +increasingly urgent need for fast and accurate parsing of the content in +document images. Therefore, this report presents PP-DocBee, a novel multimodal +large language model designed for end-to-end document image understanding. +First, we develop a data synthesis strategy tailored to document scenarios in +which we build a diverse dataset to improve the model generalization. Then, we +apply a few training techniques, including dynamic proportional sampling, data +preprocessing, and OCR postprocessing strategies. Extensive evaluations +demonstrate the superior performance of PP-DocBee, achieving state-of-the-art +results on English document understanding benchmarks and even outperforming +existing open source and commercial models in Chinese document understanding. +The source code and pre-trained models are publicly available at +\href{https://github.com/PaddlePaddle/PaddleMIX}{https://github.com/PaddlePaddle/PaddleMIX}. + +
+
+
+
+
+ + ☆ H3O: Hyper-Efficient 3D Occupancy Prediction with Heterogeneous + Supervision ICRA 2025 + + +
+ 3D occupancy prediction has recently emerged as a new paradigm for holistic +3D scene understanding and provides valuable information for downstream +planning in autonomous driving. Most existing methods, however, are +computationally expensive, requiring costly attention-based 2D-3D +transformation and 3D feature processing. In this paper, we present a novel 3D +occupancy prediction approach, H3O, which features highly efficient +architecture designs that incur a significantly lower computational cost as +compared to the current state-of-the-art methods. In addition, to compensate +for the ambiguity in ground-truth 3D occupancy labels, we advocate leveraging +auxiliary tasks to complement the direct 3D supervision. In particular, we +integrate multi-camera depth estimation, semantic segmentation, and surface +normal estimation via differentiable volume rendering, supervised by +corresponding 2D labels that introduces rich and heterogeneous supervision +signals. We conduct extensive experiments on the Occ3D-nuScenes and +SemanticKITTI benchmarks that demonstrate the superiority of our proposed H3O. + +
+
+ comment: ICRA 2025 +
+
+
+
+
+ + ♻ ☆ Detecting Systematic Weaknesses in Vision Models along Predefined + Human-Understandable Dimensions + + +
+ Slice discovery methods (SDMs) are prominent algorithms for finding +systematic weaknesses in DNNs. They identify top-k semantically coherent +slices/subsets of data where a DNN-under-test has low performance. For being +directly useful, slices should be aligned with human-understandable and +relevant dimensions, which, for example, are defined by safety and domain +experts as part of the operational design domain (ODD). While SDMs can be +applied effectively on structured data, their application on image data is +complicated by the lack of semantic metadata. To address these issues, we +present an algorithm that combines foundation models for zero-shot image +classification to generate semantic metadata with methods for combinatorial +search to find systematic weaknesses in images. In contrast to existing +approaches, ours identifies weak slices that are in line with pre-defined +human-understandable dimensions. As the algorithm includes foundation models, +its intermediate and final results may not always be exact. Therefore, we +include an approach to address the impact of noisy metadata. We validate our +algorithm on both synthetic and real-world datasets, demonstrating its ability +to recover human-understandable systematic weaknesses. Furthermore, using our +approach, we identify systematic weaknesses of multiple pre-trained and +publicly available state-of-the-art computer vision DNNs. + +
+
+
+
+
+ + ♻ ☆ ZeroBench: An Impossible Visual Benchmark for Contemporary Large + Multimodal Models + + +
+ Large Multimodal Models (LMMs) exhibit major shortfalls when interpreting +images and, by some measures, have poorer spatial cognition than small children +or animals. Despite this, they attain high scores on many popular visual +benchmarks, with headroom rapidly eroded by an ongoing surge of model progress. +To address this, there is a pressing need for difficult benchmarks that remain +relevant for longer. We take this idea to its limit by introducing ZeroBench, a +lightweight visual reasoning benchmark that is entirely impossible for +contemporary frontier LMMs. Our benchmark consists of 100 manually curated +questions and 334 less difficult subquestions. We evaluate 20 LMMs on +ZeroBench, all of which score 0.0%, and rigorously analyse the errors. To +encourage progress in visual understanding, we publicly release ZeroBench. + +
+
+ comment: 20 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Back Home: A Machine Learning Approach to Seashell Classification and + Ecosystem Restoration + + +
+ In Costa Rica, an average of 5 tons of seashells are extracted from +ecosystems annually. Confiscated seashells cannot be returned to their +ecosystems due to the lack of origin recognition. To address this issue, we +developed a convolutional neural network (CNN) specifically for seashell +identification. We built a dataset from scratch, consisting of approximately +19,000 images from the Pacific and Caribbean coasts. Using this dataset, the +model achieved a classification accuracy exceeding 85%. The model has been +integrated into a user-friendly application, which has classified over 36,000 +seashells to date, delivering real-time results within 3 seconds per image. To +further enhance the system's accuracy, an anomaly detection mechanism was +incorporated to filter out irrelevant or anomalous inputs, ensuring only valid +seashell images are processed. + +
+
+
+
+
+ + ♻ ☆ A Simple and Effective Reinforcement Learning Method for Text-to-Image + Diffusion Fine-tuning + + +
+ Reinforcement learning (RL)-based fine-tuning has emerged as a powerful +approach for aligning diffusion models with black-box objectives. Proximal +policy optimization (PPO) is the most popular choice of method for policy +optimization. While effective in terms of performance, PPO is highly sensitive +to hyper-parameters and involves substantial computational overhead. REINFORCE, +on the other hand, mitigates some computational complexities such as high +memory overhead and sensitive hyper-parameter tuning, but has suboptimal +performance due to high-variance and sample inefficiency. While the variance of +the REINFORCE can be reduced by sampling multiple actions per input prompt and +using a baseline correction term, it still suffers from sample inefficiency. To +address these challenges, we systematically analyze the +efficiency-effectiveness trade-off between REINFORCE and PPO, and propose +leave-one-out PPO (LOOP), a novel RL for diffusion fine-tuning method. LOOP +combines variance reduction techniques from REINFORCE, such as sampling +multiple actions per input prompt and a baseline correction term, with the +robustness and sample efficiency of PPO via clipping and importance sampling. +Our results demonstrate that LOOP effectively improves diffusion models on +various black-box objectives, and achieves a better balance between +computational efficiency and performance. + +
+
+
+
+
+ + ♻ ☆ A Survey of Deep Learning-based Radiology Report Generation Using + Multimodal Data + + +
+ Automatic radiology report generation can alleviate the workload for +physicians and minimize regional disparities in medical resources, therefore +becoming an important topic in the medical image analysis field. It is a +challenging task, as the computational model needs to mimic physicians to +obtain information from multi-modal input data (i.e., medical images, clinical +information, medical knowledge, etc.), and produce comprehensive and accurate +reports. Recently, numerous works have emerged to address this issue using +deep-learning-based methods, such as transformers, contrastive learning, and +knowledge-base construction. This survey summarizes the key techniques +developed in the most recent works and proposes a general workflow for +deep-learning-based report generation with five main components, including +multi-modality data acquisition, data preparation, feature learning, feature +fusion and interaction, and report generation. The state-of-the-art methods for +each of these components are highlighted. Additionally, we summarize the latest +developments in large model-based methods and model explainability, along with +public datasets, evaluation methods, current challenges, and future directions +in this field. We have also conducted a quantitative comparison between +different methods in the same experimental setting. This is the most up-to-date +survey that focuses on multi-modality inputs and data fusion for radiology +report generation. The aim is to provide comprehensive and rich information for +researchers interested in automatic clinical report generation and medical +image analysis, especially when using multimodal inputs, and to assist them in +developing new algorithms to advance the field. + +
+
+
+
+
+ + ♻ ☆ LLM-wrapper: Black-Box Semantic-Aware Adaptation of Vision-Language + Models for Referring Expression Comprehension ICLR 2025 + + +
+ Vision Language Models (VLMs) have demonstrated remarkable capabilities in +various open-vocabulary tasks, yet their zero-shot performance lags behind +task-specific fine-tuned models, particularly in complex tasks like Referring +Expression Comprehension (REC). Fine-tuning usually requires 'white-box' access +to the model's architecture and weights, which is not always feasible due to +proprietary or privacy concerns. In this work, we propose LLM-wrapper, a method +for 'black-box' adaptation of VLMs for the REC task using Large Language Models +(LLMs). LLM-wrapper capitalizes on the reasoning abilities of LLMs, improved +with a light fine-tuning, to select the most relevant bounding box matching the +referring expression, from candidates generated by a zero-shot black-box VLM. +Our approach offers several advantages: it enables the adaptation of +closed-source models without needing access to their internal workings, it is +versatile as it works with any VLM, it transfers to new VLMs and datasets, and +it allows for the adaptation of an ensemble of VLMs. We evaluate LLM-wrapper on +multiple datasets using different VLMs and LLMs, demonstrating significant +performance improvements and highlighting the versatility of our method. While +LLM-wrapper is not meant to directly compete with standard white-box +fine-tuning, it offers a practical and effective alternative for black-box VLM +adaptation. Code and checkpoints are available at +https://github.com/valeoai/LLM_wrapper . + +
+
+ comment: LLM-wrapper (v3) is published as a conference paper at ICLR 2025. (v1 + was presented at EVAL-FoMo workshop, ECCV 2024.) +
+
+
+
+
+ + ♻ ☆ Human-Feedback Efficient Reinforcement Learning for Online Diffusion + Model Finetuning ICLR + + +
+ Controllable generation through Stable Diffusion (SD) fine-tuning aims to +improve fidelity, safety, and alignment with human guidance. Existing +reinforcement learning from human feedback methods usually rely on predefined +heuristic reward functions or pretrained reward models built on large-scale +datasets, limiting their applicability to scenarios where collecting such data +is costly or difficult. To effectively and efficiently utilize human feedback, +we develop a framework, HERO, which leverages online human feedback collected +on the fly during model learning. Specifically, HERO features two key +mechanisms: (1) Feedback-Aligned Representation Learning, an online training +method that captures human feedback and provides informative learning signals +for fine-tuning, and (2) Feedback-Guided Image Generation, which involves +generating images from SD's refined initialization samples, enabling faster +convergence towards the evaluator's intent. We demonstrate that HERO is 4x more +efficient in online feedback for body part anomaly correction compared to the +best existing method. Additionally, experiments show that HERO can effectively +handle tasks like reasoning, counting, personalization, and reducing NSFW +content with only 0.5K online feedback. + +
+
+ comment: Published in International Conference on Learning Representations + (ICLR) 2025 +
+
+
+
+
+ + ♻ ☆ BHViT: Binarized Hybrid Vision Transformer CVPR2025 + + +
+ Model binarization has made significant progress in enabling real-time and +energy-efficient computation for convolutional neural networks (CNN), offering +a potential solution to the deployment challenges faced by Vision Transformers +(ViTs) on edge devices. However, due to the structural differences between CNN +and Transformer architectures, simply applying binary CNN strategies to the ViT +models will lead to a significant performance drop. To tackle this challenge, +we propose BHViT, a binarization-friendly hybrid ViT architecture and its full +binarization model with the guidance of three important observations. +Initially, BHViT utilizes the local information interaction and hierarchical +feature aggregation technique from coarse to fine levels to address redundant +computations stemming from excessive tokens. Then, a novel module based on +shift operations is proposed to enhance the performance of the binary +Multilayer Perceptron (MLP) module without significantly increasing +computational overhead. In addition, an innovative attention matrix +binarization method based on quantization decomposition is proposed to evaluate +the token's importance in the binarized attention matrix. Finally, we propose a +regularization loss to address the inadequate optimization caused by the +incompatibility between the weight oscillation in the binary layers and the +Adam Optimizer. Extensive experimental results demonstrate that our proposed +algorithm achieves SOTA performance among binary ViT methods. + +
+
+ comment: Accepted by CVPR2025 +
+
+
+
+
+ + ♻ ☆ Self-supervised pre-training with diffusion model for few-shot landmark + detection in x-ray images WACV 2025 + + +
+ Deep neural networks have been extensively applied in the medical domain for +various tasks, including image classification, segmentation, and landmark +detection. However, their application is often hindered by data scarcity, both +in terms of available annotations and images. This study introduces a novel +application of denoising diffusion probabilistic models (DDPMs) to the landmark +detection task, specifically addressing the challenge of limited annotated data +in x-ray imaging. Our key innovation lies in leveraging DDPMs for +self-supervised pre-training in landmark detection, a previously unexplored +approach in this domain. This method enables accurate landmark detection with +minimal annotated training data (as few as 50 images), surpassing both ImageNet +supervised pre-training and traditional self-supervised techniques across three +popular x-ray benchmark datasets. To our knowledge, this work represents the +first application of diffusion models for self-supervised learning in landmark +detection, which may offer a valuable pre-training approach in few-shot +regimes, for mitigating data scarcity. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Enhancing Multimodal Medical Image Classification using Cross-Graph + Modal Contrastive Learning + + +
+ The classification of medical images is a pivotal aspect of disease +diagnosis, often enhanced by deep learning techniques. However, traditional +approaches typically focus on unimodal medical image data, neglecting the +integration of diverse non-image patient data. This paper proposes a novel +Cross-Graph Modal Contrastive Learning (CGMCL) framework for multimodal +structured data from different data domains to improve medical image +classification. The model effectively integrates both image and non-image data +by constructing cross-modality graphs and leveraging contrastive learning to +align multimodal features in a shared latent space. An inter-modality feature +scaling module further optimizes the representation learning process by +reducing the gap between heterogeneous modalities. The proposed approach is +evaluated on two datasets: a Parkinson's disease (PD) dataset and a public +melanoma dataset. Results demonstrate that CGMCL outperforms conventional +unimodal methods in accuracy, interpretability, and early disease prediction. +Additionally, the method shows superior performance in multi-class melanoma +classification. The CGMCL framework provides valuable insights into medical +image classification while offering improved disease interpretability and +predictive capabilities. + +
+
+
+
+
+ + ♻ ☆ LION-FS: Fast & Slow Video-Language Thinker as Online Video Assistant CVPR 2025 + + +
+ First-person video assistants are highly anticipated to enhance our daily +lives through online video dialogue. However, existing online video assistants +often sacrifice assistant efficacy for real-time efficiency by processing +low-frame-rate videos with coarse-grained visual features. To overcome the +trade-off between efficacy and efficiency, we propose "Fast & Slow +Video-Language Thinker" as an onLIne videO assistaNt, LION-FS, achieving +real-time, proactive, temporally accurate, and contextually precise responses. +LION-FS adopts a two-stage optimization strategy: 1) Fast Path: Routing-Based +Response Determination evaluates frame-by-frame whether an immediate response +is necessary. To enhance response determination accuracy and handle higher +frame-rate inputs efficiently, we employ Token Aggregation Routing to +dynamically fuse spatiotemporal features without increasing token numbers, +while utilizing Token Dropping Routing to eliminate redundant features. 2) Slow +Path: Multi-granularity Keyframe Augmentation optimizes keyframes during +response generation. To provide comprehensive and detailed responses beyond +atomic actions constrained by training data, fine-grained spatial features and +human-environment interaction features are extracted through multi-granular +pooling. These features are further integrated into a meticulously designed +multimodal Thinking Template to guide more precise response generation. +Comprehensive evaluations on online video tasks demonstrate that LION-FS +achieves state-of-the-art efficacy and efficiency. + +
+
+ comment: Accepted to CVPR 2025, Project page: + https://github.com/JiuTian-VL/LION-FS +
+
+
+
+
+ + ♻ ☆ X-Boundary: Establishing Exact Safety Boundary to Shield LLMs from + Multi-Turn Jailbreaks without Compromising Usability + + +
+ Despite the rapid development of safety alignment techniques for LLMs, +defending against multi-turn jailbreaks is still a challenging task. In this +paper, we conduct a comprehensive comparison, revealing that some existing +defense methods can improve the robustness of LLMs against multi-turn +jailbreaks but compromise usability, i.e., reducing general capabilities or +causing the over-refusal problem. From the perspective of mechanism +interpretability of LLMs, we discover that these methods fail to establish a +boundary that exactly distinguishes safe and harmful feature representations. +Therefore, boundary-safe representations close to harmful representations are +inevitably disrupted, leading to a decline in usability. To address this issue, +we propose X-Boundary to push harmful representations away from boundary-safe +representations and obtain an exact distinction boundary. In this way, harmful +representations can be precisely erased without disrupting safe ones. +Experimental results show that X-Boundary achieves state-of-the-art defense +performance against multi-turn jailbreaks, while reducing the over-refusal rate +by about 20% and maintaining nearly complete general capability. Furthermore, +we theoretically prove and empirically verify that X-Boundary can accelerate +the convergence process during training. Please see our code at: +https://github.com/AI45Lab/X-Boundary. + +
+
+
+
+
+ + ♻ ☆ GSPR: Multimodal Place Recognition Using 3D Gaussian Splatting for + Autonomous Driving + + +
+ Place recognition is a crucial component that enables autonomous vehicles to +obtain localization results in GPS-denied environments. In recent years, +multimodal place recognition methods have gained increasing attention. They +overcome the weaknesses of unimodal sensor systems by leveraging complementary +information from different modalities. However, most existing methods explore +cross-modality correlations through feature-level or descriptor-level fusion, +suffering from a lack of interpretability. Conversely, the recently proposed 3D +Gaussian Splatting provides a new perspective on multimodal fusion by +harmonizing different modalities into an explicit scene representation. In this +paper, we propose a 3D Gaussian Splatting-based multimodal place recognition +network dubbed GSPR. It explicitly combines multi-view RGB images and LiDAR +point clouds into a spatio-temporally unified scene representation with the +proposed Multimodal Gaussian Splatting. A network composed of 3D graph +convolution and transformer is designed to extract spatio-temporal features and +global descriptors from the Gaussian scenes for place recognition. Extensive +evaluations on three datasets demonstrate that our method can effectively +leverage complementary strengths of both multi-view cameras and LiDAR, +achieving SOTA place recognition performance while maintaining solid +generalization ability. Our open-source code will be released at +https://github.com/QiZS-BIT/GSPR. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Efficient Diversity-Preserving Diffusion Alignment via Gradient-Informed + GFlowNets ICLR 2025 + + +
+ While one commonly trains large diffusion models by collecting datasets on +target downstream tasks, it is often desired to align and finetune pretrained +diffusion models with some reward functions that are either designed by experts +or learned from small-scale datasets. Existing post-training methods for reward +finetuning of diffusion models typically suffer from lack of diversity in +generated samples, lack of prior preservation, and/or slow convergence in +finetuning. Inspired by recent successes in generative flow networks +(GFlowNets), a class of probabilistic models that sample with the unnormalized +density of a reward function, we propose a novel GFlowNet method dubbed +Nabla-GFlowNet (abbreviated as $\nabla$-GFlowNet), the first GFlowNet method that +leverages the rich signal in reward gradients, together with an objective +called $\nabla$-DB plus its variant residual $\nabla$-DB designed for prior-preserving +diffusion finetuning. We show that our proposed method achieves fast yet +diversity- and prior-preserving finetuning of Stable Diffusion, a large-scale +text-conditioned image diffusion model, on different realistic reward +functions. + +
+
+ comment: Technical Report (35 pages, 31 figures), Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor + Scene Generation AAAI 2025 + + +
+ Controllable 3D scene generation has extensive applications in virtual +reality and interior design, where the generated scenes should exhibit high +levels of realism and controllability in terms of geometry. Scene graphs +provide a suitable data representation that facilitates these applications. +However, current graph-based methods for scene generation are constrained to +text-based inputs and exhibit insufficient adaptability to flexible user +inputs, hindering the ability to precisely control object geometry. To address +this issue, we propose MMGDreamer, a dual-branch diffusion model for scene +generation that incorporates a novel Mixed-Modality Graph, visual enhancement +module, and relation predictor. The mixed-modality graph allows object nodes to +integrate textual and visual modalities, with optional relationships between +nodes. It enhances adaptability to flexible user inputs and enables meticulous +control over the geometry of objects in the generated scenes. The visual +enhancement module enriches the visual fidelity of text-only nodes by +constructing visual representations using text embeddings. Furthermore, our +relation predictor leverages node representations to infer absent relationships +between nodes, resulting in more coherent scene layouts. Extensive experimental +results demonstrate that MMGDreamer exhibits superior control of object +geometry, achieving state-of-the-art scene generation performance. Project +page: https://yangzhifeio.github.io/project/MMGDreamer. + +
+
+ comment: Accepted by AAAI 2025 Main Track +
+
+
+
+
+ + ♻ ☆ FSPGD: Rethinking Black-box Attacks on Semantic Segmentation + + +
+ Transferability, the ability of adversarial examples crafted for one model to +deceive other models, is crucial for black-box attacks. Despite advancements in +attack methods for semantic segmentation, transferability remains limited, +reducing their effectiveness in real-world applications. To address this, we +introduce the Feature Similarity Projected Gradient Descent (FSPGD) attack, a +novel black-box approach that enhances both attack performance and +transferability. Unlike conventional segmentation attacks that rely on output +predictions for gradient calculation, FSPGD computes gradients from +intermediate layer features. Specifically, our method introduces a loss +function that targets local information by comparing features between clean +images and adversarial examples, while also disrupting contextual information +by accounting for spatial relationships between objects. Experiments on Pascal +VOC 2012 and Cityscapes datasets demonstrate that FSPGD achieves superior +transferability and attack performance, establishing a new state-of-the-art +benchmark. Code is available at https://github.com/KU-AIVS/FSPGD. + +
+
+
+
+
+ + ♻ ☆ UniMLVG: Unified Framework for Multi-view Long Video Generation with + Comprehensive Control Capabilities for Autonomous Driving + + +
+ The creation of diverse and realistic driving scenarios has become essential +to enhance perception and planning capabilities of the autonomous driving +system. However, generating long-duration, surround-view consistent driving +videos remains a significant challenge. To address this, we present UniMLVG, a +unified framework designed to generate extended street multi-perspective videos +under precise control. By integrating single- and multi-view driving videos +into the training data, our approach updates a DiT-based diffusion model +equipped with cross-frame and cross-view modules across three stages with multi +training objectives, substantially boosting the diversity and quality of +generated visual content. Importantly, we propose an innovative explicit +viewpoint modeling approach for multi-view video generation to effectively +improve motion transition consistency. Capable of handling various input +reference formats (e.g., text, images, or video), our UniMLVG generates +high-quality multi-view videos according to the corresponding condition +constraints such as 3D bounding boxes or frame-level text descriptions. +Compared to the best models with similar capabilities, our framework achieves +improvements of 48.2% in FID and 35.2% in FVD. + +
+
+
+
+
+ + ♻ ☆ Mocap-2-to-3: Lifting 2D Diffusion-Based Pretrained Models for 3D Motion + Capture + + +
+ Recovering absolute poses in the world coordinate system from monocular views +presents significant challenges. Two primary issues arise in this context. +Firstly, existing methods rely on 3D motion data for training, which requires +collection in limited environments. Acquiring such 3D labels for new actions in +a timely manner is impractical, severely restricting the model's generalization +capabilities. In contrast, 2D poses are far more accessible and easier to +obtain. Secondly, estimating a person's absolute position in metric space from +a single viewpoint is inherently more complex. To address these challenges, we +introduce Mocap-2-to-3, a novel framework that decomposes intricate 3D motions +into 2D poses, leveraging 2D data to enhance 3D motion reconstruction in +diverse scenarios and accurately predict absolute positions in the world +coordinate system. We initially pretrain a single-view diffusion model with +extensive 2D data, followed by fine-tuning a multi-view diffusion model for +view consistency using publicly available 3D data. This strategy facilitates +the effective use of large-scale 2D data. Additionally, we propose an +innovative human motion representation that decouples local actions from global +movements and encodes geometric priors of the ground, ensuring the generative +model learns accurate motion priors from 2D data. During inference, this allows +for the gradual recovery of global movements, resulting in more plausible +positioning. We evaluate our model's performance on real-world datasets, +demonstrating superior accuracy in motion and absolute human positioning +compared to state-of-the-art methods, along with enhanced generalization and +scalability. Our code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D + Medical Image Analysis + + +
+ Efficient evaluation of three-dimensional (3D) medical images is crucial for +diagnostic and therapeutic practices in healthcare. Recent years have seen a +substantial uptake in applying deep learning and computer vision to analyse and +interpret medical images. Traditional approaches, such as convolutional neural +networks (CNNs) and vision transformers (ViTs), face significant computational +challenges, prompting the need for architectural advancements. Recent efforts +have led to the introduction of novel architectures like the "Mamba" model as +alternative solutions to traditional CNNs or ViTs. The Mamba model excels in +the linear processing of one-dimensional data with low computational demands. +However, Mamba's potential for 3D medical image analysis remains underexplored +and could face significant computational challenges as the dimension increases. +This manuscript presents MobileViM, a streamlined architecture for efficient +segmentation of 3D medical images. In the MobileViM network, we invent a new +dimension-independent mechanism and a dual-direction traversing approach to +incorporate with a vision-Mamba-based framework. MobileViM also features a +cross-scale bridging technique to improve efficiency and accuracy across +various medical imaging modalities. With these enhancements, MobileViM achieves +segmentation speeds exceeding 90 frames per second (FPS) on a single graphics +processing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster +than the state-of-the-art deep learning models for processing 3D images with +the same computational resources. In addition, experimental evaluations +demonstrate that MobileViM delivers superior performance, with Dice similarity +scores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024, +ATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses +existing models. + +
+
+ comment: The corresponding author disagrees with the manuscript submitted to + arXiv +
+
+
+
+
+ + ♻ ☆ FRNet: Frustum-Range Networks for Scalable LiDAR Segmentation + + +
+ LiDAR segmentation has become a crucial component of advanced autonomous +driving systems. Recent range-view LiDAR segmentation approaches show promise +for real-time processing. However, they inevitably suffer from corrupted +contextual information and rely heavily on post-processing techniques for +prediction refinement. In this work, we propose FRNet, a simple yet powerful +method aimed at restoring the contextual information of range image pixels +using corresponding frustum LiDAR points. First, a frustum feature encoder +module is used to extract per-point features within the frustum region, which +preserves scene consistency and is critical for point-level predictions. Next, +a frustum-point fusion module is introduced to update per-point features +hierarchically, enabling each point to extract more surrounding information +through the frustum features. Finally, a head fusion module is used to fuse +features at different levels for final semantic predictions. Extensive +experiments conducted on four popular LiDAR segmentation benchmarks under +various task setups demonstrate the superiority of FRNet. Notably, FRNet +achieves 73.3% and 82.5% mIoU scores on the testing sets of SemanticKITTI and +nuScenes. While achieving competitive performance, FRNet operates 5 times +faster than state-of-the-art approaches. Such high efficiency opens up new +possibilities for more scalable LiDAR segmentation. The code has been made +publicly available at https://github.com/Xiangxu-0103/FRNet. + +
+
+ comment: TIP 2025; 18 pages, 11 figures, 14 tables; Code at + https://github.com/Xiangxu-0103/FRNet +
+
+
+
+
+ + ♻ ☆ A Dataset and Benchmark for Shape Completion of Fruits for Agricultural + Robotics + + +
+ As the world population is expected to reach 10 billion by 2050, our +agricultural production system needs to double its productivity despite a +decline of human workforce in the agricultural sector. Autonomous robotic +systems are one promising pathway to increase productivity by taking over +labor-intensive manual tasks like fruit picking. To be effective, such systems +need to monitor and interact with plants and fruits precisely, which is +challenging due to the cluttered nature of agricultural environments causing, +for example, strong occlusions. Thus, being able to estimate the complete 3D +shapes of objects in presence of occlusions is crucial for automating +operations such as fruit harvesting. In this paper, we propose the first +publicly available 3D shape completion dataset for agricultural vision systems. +We provide an RGB-D dataset for estimating the 3D shape of fruits. +Specifically, our dataset contains RGB-D frames of single sweet peppers in lab +conditions but also in a commercial greenhouse. For each fruit, we additionally +collected high-precision point clouds that we use as ground truth. For +acquiring the ground truth shape, we developed a measuring process that allows +us to record data of real sweet pepper plants, both in the lab and in the +greenhouse with high precision, and determine the shape of the sensed fruits. +We release our dataset, consisting of almost 7,000 RGB-D frames belonging to +more than 100 different fruits. We provide segmented RGB-D frames, with camera +intrinsics to easily obtain colored point clouds, together with the +corresponding high-precision, occlusion-free point clouds obtained with a +high-precision laser scanner. We additionally enable evaluation of shape +completion approaches on a hidden test set through a public challenge on a +benchmark server. + +
+
+
+
+
+ + ♻ ☆ Towards Effective and Sparse Adversarial Attack on Spiking Neural + Networks via Breaking Invisible Surrogate Gradients CVPR 2025 + + +
+ Spiking neural networks (SNNs) have shown their competence in handling +spatial-temporal event-based data with low energy consumption. Similar to +conventional artificial neural networks (ANNs), SNNs are also vulnerable to +gradient-based adversarial attacks, wherein gradients are calculated by +spatial-temporal back-propagation (STBP) and surrogate gradients (SGs). +However, the SGs may be invisible for an inference-only model as they do not +influence the inference results, and current gradient-based attacks are +ineffective for binary dynamic images captured by the dynamic vision sensor +(DVS). While some approaches addressed the issue of invisible SGs through +universal SGs, their SGs lack a correlation with the victim model, resulting in +sub-optimal performance. Moreover, the imperceptibility of existing SNN-based +binary attacks is still insufficient. In this paper, we introduce an innovative +potential-dependent surrogate gradient (PDSG) method to establish a robust +connection between the SG and the model, thereby enhancing the adaptability of +adversarial attacks across various models with invisible SGs. Additionally, we +propose the sparse dynamic attack (SDA) to effectively attack binary dynamic +images. Utilizing a generation-reduction paradigm, SDA can fully optimize the +sparsity of adversarial perturbations. Experimental results demonstrate that +our PDSG and SDA outperform state-of-the-art SNN-based attacks across various +models and datasets. Specifically, our PDSG achieves 100% attack success rate +on ImageNet, and our SDA obtains 82% attack success rate by modifying only +0.24% of the pixels on CIFAR10DVS. The code is available at +https://github.com/ryime/PDSG-SDA . + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Continual Learning in 3D Point Clouds: Employing Spectral Techniques for + Exemplar Selection WACV 2025 + + +
+ We introduce a novel framework for Continual Learning in 3D object +classification. Our approach, CL3D, is based on the selection of prototypes +from each class using spectral clustering. For non-Euclidean data such as point +clouds, spectral clustering can be employed as long as one can define a +distance measure between pairs of samples. Choosing the appropriate distance +measure enables us to leverage 3D geometric characteristics to identify +representative prototypes for each class. We explore the effectiveness of +clustering in the input space (3D points), local feature space +(1024-dimensional points), and global feature space. We conduct experiments on +the ModelNet40, ShapeNet, and ScanNet datasets, achieving state-of-the-art +accuracy exclusively through the use of input space features. By leveraging the +combined input, local, and global features, we have improved the +state-of-the-art on ModelNet and ShapeNet, utilizing nearly half the memory +used by competing approaches. For the challenging ScanNet dataset, our method +enhances accuracy by 4.1% while consuming just 28% of the memory used by our +competitors, demonstrating the scalability of our approach. + +
+
+ comment: Accepted to WACV 2025, Tucson, Arizona, USA +
+
+
+
+
+ + ♻ ☆ Implantable Adaptive Cells: A Novel Enhancement for Pre-Trained U-Nets + in Medical Image Segmentation + + +
+ This paper introduces a novel approach to enhance the performance of +pre-trained neural networks in medical image segmentation using gradient-based +Neural Architecture Search (NAS) methods. We present the concept of Implantable +Adaptive Cell (IAC), small modules identified through Partially-Connected DARTS +based approach, designed to be injected into the skip connections of an +existing and already trained U-shaped model. Unlike traditional NAS methods, +our approach refines existing architectures without full retraining. +Experiments on four medical datasets with MRI and CT images show consistent +accuracy improvements on various U-Net configurations, with segmentation +accuracy gain by approximately 5 percentage points across all validation +datasets, with improvements reaching up to 11\%pt in the best-performing cases. +The findings of this study not only offer a cost-effective alternative to the +complete overhaul of complex models for performance upgrades but also indicate +the potential applicability of our method to other architectures and problem +domains. + +
+
+
+
+
+ + ♻ ☆ Structured Preference Optimization for Vision-Language Long-Horizon Task + Planning + + +
+ Existing methods for vision-language task planning excel in short-horizon +tasks but often fall short in complex, long-horizon planning within dynamic +environments. These challenges primarily arise from the difficulty of +effectively training models to produce high-quality reasoning processes for +long-horizon tasks. To address this, we propose Structured Preference +Optimization (SPO), which aims to enhance reasoning and action selection in +long-horizon task planning through structured preference evaluation and +optimized training strategies. Specifically, SPO introduces: 1) +Preference-Based Scoring and Optimization, which systematically evaluates +reasoning chains based on task relevance, visual grounding, and historical +consistency; and 2) Curriculum-Guided Training, where the model progressively +adapts from simple to complex tasks, improving its generalization ability in +long-horizon scenarios and enhancing reasoning robustness. To advance research +in vision-language long-horizon task planning, we introduce ExtendaBench, a +comprehensive benchmark covering 1,509 tasks across VirtualHome and Habitat +2.0, categorized into ultra-short, short, medium, and long tasks. Experimental +results demonstrate that SPO significantly improves reasoning quality and final +decision accuracy, outperforming prior methods on long-horizon tasks and +underscoring the effectiveness of preference-driven optimization in +vision-language task planning. Specifically, SPO achieves a +5.98% GCR and ++4.68% SR improvement in VirtualHome and a +3.30% GCR and +2.11% SR improvement +in Habitat over the best-performing baselines. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Enhancing Vietnamese VQA through Curriculum Learning on Raw and + Augmented Text Representations AAAI-25 + + +
+ Visual Question Answering (VQA) is a multimodal task requiring reasoning +across textual and visual inputs, which becomes particularly challenging in +low-resource languages like Vietnamese due to linguistic variability and the +lack of high-quality datasets. Traditional methods often rely heavily on +extensive annotated datasets, computationally expensive pipelines, and large +pre-trained models, specifically in the domain of Vietnamese VQA, limiting +their applicability in such scenarios. To address these limitations, we propose +a training framework that combines a paraphrase-based feature augmentation +module with a dynamic curriculum learning strategy. Explicitly, augmented +samples are considered "easy" while raw samples are regarded as "hard". The +framework then utilizes a mechanism that dynamically adjusts the ratio of easy +to hard samples during training, progressively modifying the same dataset to +increase its difficulty level. By enabling gradual adaptation to task +complexity, this approach helps the Vietnamese VQA model generalize well, thus +improving overall performance. Experimental results show consistent +improvements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset, +highlighting both the potential and challenges of our approach in advancing VQA +for Vietnamese language. + +
+
+ comment: 10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and + Intelligence +
+
+
+
+
+ + ♻ ☆ Pathfinder for Low-altitude Aircraft with Binary Neural Network + + +
+ A prior global topological map (e.g., the OpenStreetMap, OSM) can boost the +performance of autonomous mapping by a ground mobile robot. However, the prior +map is usually incomplete due to lacking labeling in partial paths. To solve +this problem, this paper proposes an OSM maker using airborne sensors carried +by low-altitude aircraft, where the core of the OSM maker is a novel efficient +pathfinder approach based on LiDAR and camera data, i.e., a binary dual-stream +road segmentation model. Specifically, a multi-scale feature extraction based +on the UNet architecture is implemented for images and point clouds. To reduce +the effect caused by the sparsity of point cloud, an attention-guided gated +block is designed to integrate image and point-cloud features. To optimize the +model for edge deployment that significantly reduces storage footprint and +computational demands, we propose a binarization streamline to each model +component, including a variant of vision transformer (ViT) architecture as the +encoder of the image branch, and new focal and perception losses to optimize +the model training. The experimental results on two datasets demonstrate that +our pathfinder method achieves SOTA accuracy with high efficiency in finding +paths from the low-level airborne sensors, and we can create complete OSM prior +maps based on the segmented road skeletons. Code and data are available at: +\href{https://github.com/IMRL/Pathfinder}{https://github.com/IMRL/Pathfinder}. + +
+
+
+
+
+ + ♻ ☆ Deep unrolling for learning optimal spatially varying regularisation + parameters for Total Generalised Variation + + +
+ We extend a recently introduced deep unrolling framework for learning +spatially varying regularisation parameters in inverse imaging problems to the +case of Total Generalised Variation (TGV). The framework combines a deep +convolutional neural network (CNN) inferring the two spatially varying TGV +parameters with an unrolled algorithmic scheme that solves the corresponding +variational problem. The two subnetworks are jointly trained end-to-end in a +supervised fashion and as such the CNN learns to compute those parameters that +drive the reconstructed images as close to the ground truth as possible. +Numerical results in image denoising and MRI reconstruction show a significant +qualitative and quantitative improvement compared to the best TGV scalar +parameter case as well as to other approaches employing spatially varying +parameters computed by unsupervised methods. We also observe that the inferred +spatially varying parameter maps have a consistent structure near the image +edges, asking for further theoretical investigations. In particular, the +parameter that weighs the first-order TGV term has a triple-edge structure with +alternating high-low-high values whereas the one that weighs the second-order +term attains small values in a large neighbourhood around the edges. + +
+
+
+
+
+ + ♻ ☆ InfoDisent: Explainability of Image Classification Models by Information + Disentanglement + + +
+ In this work, we introduce InfoDisent, a hybrid approach to explainability +based on the information bottleneck principle. InfoDisent enables the +disentanglement of information in the final layer of any pretrained model into +atomic concepts, which can be interpreted as prototypical parts. This approach +merges the flexibility of post-hoc methods with the concept-level modeling +capabilities of self-explainable neural networks, such as ProtoPNets. We +demonstrate the effectiveness of InfoDisent through computational experiments +and user studies across various datasets using modern backbones such as ViTs +and convolutional networks. Notably, InfoDisent generalizes the prototypical +parts approach to novel domains (ImageNet). + +
+
+
+
+
+ + ♻ ☆ A Backbone for Long-Horizon Robot Task Understanding + + +
+ End-to-end robot learning, particularly for long-horizon tasks, often results +in unpredictable outcomes and poor generalization. To address these challenges, +we propose a novel Therblig-Based Backbone Framework (TBBF) as a fundamental +structure to enhance interpretability, data efficiency, and generalization in +robotic systems. TBBF utilizes expert demonstrations to enable therblig-level +task decomposition, facilitate efficient action-object mapping, and generate +adaptive trajectories for new scenarios. The approach consists of two stages: +offline training and online testing. During the offline training stage, we +developed the Meta-RGate SynerFusion (MGSF) network for accurate therblig +segmentation across various tasks. In the online testing stage, after a +one-shot demonstration of a new task is collected, our MGSF network extracts +high-level knowledge, which is then encoded into the image using Action +Registration (ActionREG). Additionally, Large Language Model (LLM)-Alignment +Policy for Visual Correction (LAP-VC) is employed to ensure precise action +registration, facilitating trajectory transfer in novel robot scenarios. +Experimental results validate these methods, achieving 94.37% recall in +therblig segmentation and success rates of 94.4% and 80% in real-world online +robot testing for simple and complex scenarios, respectively. Supplementary +material is available at: +https://sites.google.com/view/therbligsbasedbackbone/home + +
+
+ comment: 8 pages, 8 figures. This work has been published by IEEE Robotics and + Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ DongbaMIE: A Multimodal Information Extraction Dataset for Evaluating + Semantic Understanding of Dongba Pictograms + + +
+ Dongba pictographs are the only pictographs still in use in the world. They +have pictorial ideographic features, and their symbols carry rich cultural and +contextual information. Due to the lack of relevant datasets, existing research +has difficulty in advancing the study of semantic understanding of Dongba +pictographs. To this end, we propose DongbaMIE, the first multimodal dataset +for semantic understanding and extraction of Dongba pictographs. The dataset +consists of Dongba pictograph images and their corresponding Chinese semantic +annotations. It contains 23,530 sentence-level and 2,539 paragraph-level +images, covering four semantic dimensions: objects, actions, relations, and +attributes. We systematically evaluate the GPT-4o, Gemini-2.0, and Qwen2-VL +models. Experimental results show that the F1 scores of GPT-4o and Gemini in +the best object extraction are only 3.16 and 3.11 respectively. The F1 score of +Qwen2-VL after supervised fine-tuning is only 11.49. These results suggest that +current large multimodal models still face significant challenges in accurately +recognizing the diverse semantic information in Dongba pictographs. The dataset +can be obtained from this URL. + +
+
+
+
+
+ + ♻ ☆ Federated Learning With Individualized Privacy Through Client Sampling ICML + + +
+ With growing concerns about user data collection, individualized privacy has +emerged as a promising solution to balance protection and utility by accounting +for diverse user privacy preferences. Instead of enforcing a uniform level of +anonymization for all users, this approach allows individuals to choose privacy +settings that align with their comfort levels. Building on this idea, we +propose an adapted method for enabling Individualized Differential Privacy +(IDP) in Federated Learning (FL) by handling clients according to their +personal privacy preferences. By extending the SAMPLE algorithm from +centralized settings to FL, we calculate client-specific sampling rates based +on their heterogeneous privacy budgets and integrate them into a modified +IDP-FedAvg algorithm. We test this method under realistic privacy distributions +and multiple datasets. The experimental results demonstrate that our approach +achieves clear improvements over uniform DP baselines, reducing the trade-off +between privacy and utility. Compared to the alternative SCALE method in +related work, which assigns differing noise scales to clients, our method +performs notably better. However, challenges remain for complex tasks with +non-i.i.d. data, primarily stemming from the constraints of the decentralized +setting. + +
+
+ comment: Accepted at 10th International Conference on Machine Learning + Technologies (ICMLT 2025) +
+
+
+
+
+ + ♻ ☆ Modulating CNN Features with Pre-Trained ViT Representations for + Open-Vocabulary Object Detection + + +
+ Owing to large-scale image-text contrastive training, pre-trained vision +language model (VLM) like CLIP shows superior open-vocabulary recognition +ability. Most existing open-vocabulary object detectors attempt to utilize the +pre-trained VLMs to attain generalized representation. F-ViT uses the +pre-trained visual encoder as the backbone network and freezes it during +training. However, its frozen backbone doesn't benefit from the labeled data to +strengthen the representation for detection. Therefore, we propose a novel +two-branch backbone network, named as \textbf{V}iT-Feature-\textbf{M}odulated +Multi-Scale \textbf{C}onvolutional Network (VMCNet), which consists of a +trainable convolutional branch, a frozen pre-trained ViT branch and a VMC +module. The trainable CNN branch could be optimized with labeled data while the +frozen pre-trained ViT branch could keep the representation ability derived +from large-scale pre-training. Then, the proposed VMC module could modulate the +multi-scale CNN features with the representations from ViT branch. With this +proposed mixed structure, the detector is more likely to discover objects of +novel categories. Evaluated on two popular benchmarks, our method boosts the +detection performance on novel category and outperforms state-of-the-art +methods. On OV-COCO, the proposed method achieves 44.3 +AP$_{50}^{\mathrm{novel}}$ with ViT-B/16 and 48.5 AP$_{50}^{\mathrm{novel}}$ +with ViT-L/14. On OV-LVIS, VMCNet with ViT-B/16 and ViT-L/14 reaches 27.8 and +38.4 mAP$_{r}$. + +
+
+
+
+
+ + ♻ ☆ $σ$-zero: Gradient-based Optimization of $\ell_0$-norm Adversarial + Examples ICLR 2025 + + +
+ Evaluating the adversarial robustness of deep networks to gradient-based +attacks is challenging. While most attacks consider $\ell_2$- and +$\ell_\infty$-norm constraints to craft input perturbations, only a few +investigate sparse $\ell_1$- and $\ell_0$-norm attacks. In particular, +$\ell_0$-norm attacks remain the least studied due to the inherent complexity +of optimizing over a non-convex and non-differentiable constraint. However, +evaluating adversarial robustness under these attacks could reveal weaknesses +otherwise left untested with more conventional $\ell_2$- and $\ell_\infty$-norm +attacks. In this work, we propose a novel $\ell_0$-norm attack, called +$\sigma$-zero, which leverages a differentiable approximation of the $\ell_0$ +norm to facilitate gradient-based optimization, and an adaptive projection +operator to dynamically adjust the trade-off between loss minimization and +perturbation sparsity. Extensive evaluations using MNIST, CIFAR10, and ImageNet +datasets, involving robust and non-robust models, show that +$\sigma$\texttt{-zero} finds minimum $\ell_0$-norm adversarial examples without +requiring any time-consuming hyperparameter tuning, and that it outperforms all +competing sparse attacks in terms of success rate, perturbation size, and +efficiency. + +
+
+ comment: Paper accepted at International Conference on Learning + Representations (ICLR 2025). Code available at + https://github.com/sigma0-advx/sigma-zero +
+
+
+
+
+ + ♻ ☆ VISION-XL: High Definition Video Inverse Problem Solver using Latent + Image Diffusion Models + + +
+ In this paper, we propose a novel framework for solving high-definition video +inverse problems using latent image diffusion models. Building on recent +advancements in spatio-temporal optimization for video inverse problems using +image diffusion models, our approach leverages latent-space diffusion models to +achieve enhanced video quality and resolution. To address the high +computational demands of processing high-resolution frames, we introduce a +pseudo-batch consistent sampling strategy, allowing efficient operation on a +single GPU. Additionally, to improve temporal consistency, we present +pseudo-batch inversion, an initialization technique that incorporates +informative latents from the measurement. By integrating with SDXL, our +framework achieves state-of-the-art video reconstruction across a wide range of +spatio-temporal inverse problems, including complex combinations of frame +averaging and various spatial degradations, such as deblurring, +super-resolution, and inpainting. Unlike previous methods, our approach +supports multiple aspect ratios (landscape, vertical, and square) and delivers +HD-resolution reconstructions (exceeding 1280x720) in under 6 seconds per frame +on a single NVIDIA 4090 GPU. + +
+
+ comment: Project page: https://vision-xl.github.io/ +
+
+
+
+
+ + ♻ ☆ No More Sliding Window: Efficient 3D Medical Image Segmentation with + Differentiable Top-k Patch Sampling + + +
+ 3D models surpass 2D models in CT/MRI segmentation by effectively capturing +inter-slice relationships. However, the added depth dimension substantially +increases memory consumption. While patch-based training alleviates memory +constraints, it significantly slows down the inference speed due to the sliding +window (SW) approach. We propose No-More-Sliding-Window (NMSW), a novel +end-to-end trainable framework that enhances the efficiency of generic 3D +segmentation backbone during an inference step by eliminating the need for SW. +NMSW employs a differentiable Top-k module to selectively sample only the most +relevant patches, thereby minimizing redundant computations. When patch-level +predictions are insufficient, the framework intelligently leverages coarse +global predictions to refine results. Evaluated across 3 tasks using 3 +segmentation backbones, NMSW achieves competitive accuracy compared to SW +inference while significantly reducing computational complexity by 91% (88.0 to +8.00 TMACs). Moreover, it delivers a 9.1x faster inference on the H100 GPU +(99.0 to 8.3 sec) and a 11.1x faster inference on the Xeon Gold CPU (2110 to +189 sec). NMSW is model-agnostic, further boosting efficiency when integrated +with any existing efficient segmentation backbones. + +
+
+
+
+
+ + ♻ ☆ OmniGuard: Hybrid Manipulation Localization via Augmented Versatile Deep + Image Watermarking CVPR 2025 + + +
+ With the rapid growth of generative AI and its widespread application in +image editing, new risks have emerged regarding the authenticity and integrity +of digital content. Existing versatile watermarking approaches suffer from +trade-offs between tamper localization precision and visual quality. +Constrained by the limited flexibility of previous framework, their localized +watermark must remain fixed across all images. Under AIGC-editing, their +copyright extraction accuracy is also unsatisfactory. To address these +challenges, we propose OmniGuard, a novel augmented versatile watermarking +approach that integrates proactive embedding with passive, blind extraction for +robust copyright protection and tamper localization. OmniGuard employs a hybrid +forensic framework that enables flexible localization watermark selection and +introduces a degradation-aware tamper extraction network for precise +localization under challenging conditions. Additionally, a lightweight +AIGC-editing simulation layer is designed to enhance robustness across global +and local editing. Extensive experiments show that OmniGuard achieves superior +fidelity, robustness, and flexibility. Compared to the recent state-of-the-art +approach EditGuard, our method outperforms it by 4.25dB in PSNR of the +container image, 20.7% in F1-Score under noisy conditions, and 14.8% in average +bit accuracy. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ MIAdapt: Source-free Few-shot Domain Adaptive Object Detection for + Microscopic Images + + +
+ Existing generic unsupervised domain adaptation approaches require access to +both a large labeled source dataset and a sufficient unlabeled target dataset +during adaptation. However, collecting a large dataset, even if unlabeled, is a +challenging and expensive endeavor, especially in medical imaging. In addition, +constraints such as privacy issues can result in cases where source data is +unavailable. Taking in consideration these challenges, we propose MIAdapt, an +adaptive approach for Microscopic Imagery Adaptation as a solution for +Source-free Few-shot Domain Adaptive Object detection (SF-FSDA). We also define +two competitive baselines (1) Faster-FreeShot and (2) MT-FreeShot. Extensive +experiments on the challenging M5-Malaria and Raabin-WBC datasets validate the +effectiveness of MIAdapt. Without using any image from the source domain +MIAdapt surpasses state-of-the-art source-free UDA (SF-UDA) methods by +21.3% +mAP and few-shot domain adaptation (FSDA) approaches by +4.7% mAP on +Raabin-WBC. Our code and models will be publicly available. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Indirect Gradient Matching for Adversarial Robust Distillation ICLR 2025 + + +
+ Adversarial training significantly improves adversarial robustness, but +superior performance is primarily attained with large models. This substantial +performance gap for smaller models has spurred active research into adversarial +distillation (AD) to mitigate the difference. Existing AD methods leverage the +teacher's logits as a guide. In contrast to these approaches, we aim to +transfer another piece of knowledge from the teacher, the input gradient. In +this paper, we propose a distillation module termed Indirect Gradient +Distillation Module (IGDM) that indirectly matches the student's input gradient +with that of the teacher. Experimental results show that IGDM seamlessly +integrates with existing AD methods, significantly enhancing their performance. +Particularly, utilizing IGDM on the CIFAR-100 dataset improves the AutoAttack +accuracy from 28.06% to 30.32% with the ResNet-18 architecture and from 26.18% +to 29.32% with the MobileNetV2 architecture when integrated into the SOTA +method without additional data augmentation. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Drag Your Gaussian: Effective Drag-Based Editing with Score Distillation + for 3D Gaussian Splatting + + +
+ Recent advancements in 3D scene editing have been propelled by the rapid +development of generative models. Existing methods typically utilize generative +models to perform text-guided editing on 3D representations, such as 3D +Gaussian Splatting (3DGS). However, these methods are often limited to texture +modifications and fail when addressing geometric changes, such as editing a +character's head to turn around. Moreover, such methods lack accurate control +over the spatial position of editing results, as language struggles to +precisely describe the extent of edits. To overcome these limitations, we +introduce DYG, an effective 3D drag-based editing method for 3D Gaussian +Splatting. It enables users to conveniently specify the desired editing region +and the desired dragging direction through the input of 3D masks and pairs of +control points, thereby enabling precise control over the extent of editing. +DYG integrates the strengths of the implicit triplane representation to +establish the geometric scaffold of the editing results, effectively overcoming +suboptimal editing outcomes caused by the sparsity of 3DGS in the desired +editing regions. Additionally, we incorporate a drag-based Latent Diffusion +Model into our method through the proposed Drag-SDS loss function, enabling +flexible, multi-view consistent, and fine-grained editing. Extensive +experiments demonstrate that DYG conducts effective drag-based editing guided +by control point prompts, surpassing other baselines in terms of editing effect +and quality, both qualitatively and quantitatively. Visit our project page at +https://quyans.github.io/Drag-Your-Gaussian. + +
+
+ comment: Visit our project page at https://quyans.github.io/Drag-Your-Gaussian +
+
+
+
+
+ + ♻ ☆ GIFT: Unlocking Full Potential of Labels in Distilled Dataset at + Near-zero Cost + + +
+ Recent advancements in dataset distillation have demonstrated the significant +benefits of employing soft labels generated by pre-trained teacher models. In +this paper, we introduce a novel perspective by emphasizing the full +utilization of labels. We first conduct a comprehensive comparison of various +loss functions for soft label utilization in dataset distillation, revealing +that the model trained on the synthetic dataset exhibits high sensitivity to +the choice of loss function for soft label utilization. This finding highlights +the necessity of a universal loss function for training models on synthetic +datasets. Building on these insights, we introduce an extremely simple yet +surprisingly effective plug-and-play approach, GIFT, which encompasses soft +label refinement and a cosine similarity-based loss function to efficiently +leverage full label information. Extensive experiments indicate that GIFT +consistently enhances state-of-the-art dataset distillation methods across +various dataset scales, without incurring additional computational costs. +Importantly, GIFT significantly enhances cross-optimizer generalization, an +area previously overlooked. For instance, on ImageNet-1K with IPC = 10, GIFT +enhances the state-of-the-art method RDED by 30.8% in cross-optimizer +generalization. Our code is available at https://github.com/LINs-lab/GIFT. + +
+
+ comment: https://github.com/LINs-lab/GIFT +
+
+
+
+
+ + ♻ ☆ Manta: Enhancing Mamba for Few-Shot Action Recognition of Long + Sub-Sequence AAAI 2025 + + +
+ In few-shot action recognition (FSAR), long sub-sequences of video naturally +express entire actions more effectively. However, the high computational +complexity of mainstream Transformer-based methods limits their application. +Recent Mamba demonstrates efficiency in modeling long sequences, but directly +applying Mamba to FSAR overlooks the importance of local feature modeling and +alignment. Moreover, long sub-sequences within the same class accumulate +intra-class variance, which adversely impacts FSAR performance. To solve these +challenges, we propose a Matryoshka MAmba and CoNtrasTive LeArning framework +(Manta). Firstly, the Matryoshka Mamba introduces multiple Inner Modules to +enhance local feature representation, rather than directly modeling global +features. An Outer Module captures dependencies of timeline between these local +features for implicit temporal alignment. Secondly, a hybrid contrastive +learning paradigm, combining both supervised and unsupervised methods, is +designed to mitigate the negative effects of intra-class variance accumulation. +The Matryoshka Mamba and the hybrid contrastive learning paradigm operate in +two parallel branches within Manta, enhancing Mamba for FSAR of long +sub-sequence. Manta achieves new state-of-the-art performance on prominent +benchmarks, including SSv2, Kinetics, UCF101, and HMDB51. Extensive empirical +studies prove that Manta significantly improves FSAR of long sub-sequence from +multiple perspectives. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Locate Anything on Earth: Advancing Open-Vocabulary Object Detection for + Remote Sensing Community + + +
+ Object detection, particularly open-vocabulary object detection, plays a +crucial role in Earth sciences, such as environmental monitoring, natural +disaster assessment, and land-use planning. However, existing open-vocabulary +detectors, primarily trained on natural-world images, struggle to generalize to +remote sensing images due to a significant data domain gap. Thus, this paper +aims to advance the development of open-vocabulary object detection in remote +sensing community. To achieve this, we first reformulate the task as Locate +Anything on Earth (LAE) with the goal of detecting any novel concepts on Earth. +We then developed the LAE-Label Engine which collects, auto-annotates, and +unifies up to 10 remote sensing datasets creating the LAE-1M - the first +large-scale remote sensing object detection dataset with broad category +coverage. Using the LAE-1M, we further propose and train the novel LAE-DINO +Model, the first open-vocabulary foundation object detector for the LAE task, +featuring Dynamic Vocabulary Construction (DVC) and Visual-Guided Text Prompt +Learning (VisGT) modules. DVC dynamically constructs vocabulary for each +training batch, while VisGT maps visual features to semantic space, enhancing +text features. We comprehensively conduct experiments on established remote +sensing benchmark DIOR, DOTAv2.0, as well as our newly introduced 80-class +LAE-80C benchmark. Results demonstrate the advantages of the LAE-1M dataset and +the effectiveness of the LAE-DINO method. + +
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ SSL4EO-S12 v1.1: A Multimodal, Multiseasonal Dataset for Pretraining, + Updated + + +
+ This technical report presents SSL4EO-S12 v1.1, a multimodal, multitemporal +Earth Observation dataset designed for pretraining large-scale foundation +models. Building on the success of SSL4EO-S12 v1.0, the new version addresses +the previous challenges of data misalignment and a limited data structure for +low-barrier, analysis-ready EO processing. SSL4EO-S12 v1.1 covers the world's +10,000 largest cities and their surroundings within a 50 km radius across four +seasons, resulting in a diverse collection of nearly one million patches. +SSL4EO-S12 v1.1 packages the data in Zarr file format for cloud-efficient +loading and representation of meta-information such as cloud masks +and geolocation. Released under the CC-BY-4.0 license, SSL4EO-S12 v1.1 +facilitates open research and provides a robust foundation for future +advancements in self-supervised learning and geospatial analysis. The dataset +is available online through https://datapub.fz-juelich.de/ssl4eo-s12, and we +provide additional resources at https://github.com/DLR-MF-DAS/SSL4EO-S12-v1.1. + +
+
+
+
+
+ + ♻ ☆ StoryTeller: Improving Long Video Description through Global + Audio-Visual Character Identification + + +
+ Existing large vision-language models (LVLMs) are largely limited to +processing short, seconds-long videos and struggle with generating coherent +descriptions for extended video spanning minutes or more. Long video +description introduces new challenges, such as consistent character +identification and plot-level descriptions incorporating both visual and audio +information. To address these, we figure out audio-visual character +identification, matching character names to each dialogue, as a key factor. We +propose StoryTeller, a system for generating dense descriptions of long videos, +incorporating both low-level visual concepts and high-level plot information. +StoryTeller uses a multimodal large language model that integrates visual, +audio, and text modalities to perform audio-visual character identification on +minute-long video clips. The results are then fed into a LVLM to enhance +consistency of video description. We validate our approach on movie description +tasks and introduce MovieStory101, a dataset with dense descriptions for +three-minute movie clips. To evaluate long video descriptions, we create +StoryQA, a large set of multiple-choice questions for MovieStory101 test set. +We assess descriptions by inputting them into GPT-4 to answer these questions, +using accuracy as an automatic evaluation metric. Experiments show that +StoryTeller outperforms all open and closed-source baselines on StoryQA, +achieving 9.5% higher accuracy than the strongest baseline, Gemini-1.5-pro, and +demonstrating a +15.56% advantage in human side-by-side evaluations. +Additionally, incorporating audio-visual character identification from +StoryTeller improves the performance of all video description models, with +Gemini-1.5-pro and GPT-4o showing relative improvement of 5.5% and 13.0%, +respectively, in accuracy on StoryQA. + +
+
+
+
+
+ + ♻ ☆ Rethinking Weight-Averaged Model-merging + + +
+ Model-merging has emerged as a powerful approach in deep learning, capable of +enhancing model performance without any training. However, the underlying +mechanisms that explain its effectiveness remain largely unexplored. In this +paper, we investigate this technique from three novel perspectives to +empirically provide deeper insights into why and how weight-averaged +model-merging works: (1) we examine the intrinsic patterns captured by the +learning of the model weights, through the visualizations of their patterns on +several datasets, showing that these weights often encode structured and +interpretable patterns, which is the essential reason why model-merging can work; +(2) we mathematically and empirically investigate model ensemble merging +strategies based on averaging on weights versus averaging on features, +providing detailed analyses across diverse architectures and datasets; and (3) +we explore the impact on model-merging prediction stability in terms of +changing the parameter magnitude, revealing insights into the way weight +averaging works as regularization by showing the robustness across different +parameter scales. Our findings shed light on the "black box" of weight-averaged +model-merging, offering valuable insights and practical recommendations that +advance the model-merging process. The code is available at +https://github.com/billhhh/Rethink-Merge. + +
+
+
+
+
+ + ♻ ☆ Explaining Caption-Image Interactions in CLIP models with Second-Order + Attributions + + +
+ Dual encoder architectures like CLIP models map two types of inputs into a +shared embedding space and predict similarities between them. Despite their +success, it is, however, not understood how these models compare their two +inputs. Common first-order feature-attribution methods can only provide limited +insights into dual-encoders since their predictions depend on +feature-interactions rather than on individual features. In this paper, we +first derive a second-order method enabling the attribution of predictions by +any differentiable dual encoder onto feature-interactions between its inputs. +Second, we apply our method to CLIP models and show that they learn +fine-grained correspondences between parts of captions and regions in images. +They match objects across input modes and also account for mismatches. This +visual-linguistic grounding ability, however, varies heavily between object +classes and exhibits pronounced out-of-domain effects. We can identify +individual errors as well as systematic failure categories including object +coverage, unusual scenes and correlated contexts. + +
+
+
+
+
+ + ♻ ☆ Meta-Learned Modality-Weighted Knowledge Distillation for Robust + Multi-Modal Learning with Missing Data + + +
+ In multi-modal learning, some modalities are more influential than others, +and their absence can have a significant impact on classification/segmentation +accuracy. Addressing this challenge, we propose a novel approach called +Meta-learned Modality-weighted Knowledge Distillation (MetaKD), which enables +multi-modal models to maintain high accuracy even when key modalities are +missing. MetaKD adaptively estimates the importance weight of each modality +through a meta-learning process. These learned importance weights guide a +pairwise modality-weighted knowledge distillation process, allowing +high-importance modalities to transfer knowledge to lower-importance ones, +resulting in robust performance despite missing inputs. Unlike previous methods +in the field, which are often task-specific and require significant +modifications, our approach is designed to work in multiple tasks (e.g., +segmentation and classification) with minimal adaptation. Experimental results +on five prevalent datasets, including three Brain Tumor Segmentation datasets +(BraTS2018, BraTS2019 and BraTS2020), the Alzheimer's Disease Neuroimaging +Initiative (ADNI) classification dataset and the Audiovision-MNIST +classification dataset, demonstrate the proposed model is able to outperform +the compared models by a large margin. The code is available at +https://github.com/billhhh/MetaKD. + +
+
+
+
+
+ + ♻ ☆ Efficient Masked AutoEncoder for Video Object Counting and A Large-Scale + Benchmark ICLR25 + + +
+ The dynamic imbalance of the fore-background is a major challenge in video +object counting, which is usually caused by the sparsity of target objects. +This remains understudied in existing works and often leads to severe +under-/over-prediction errors. To tackle this issue in video object counting, +we propose a density-embedded Efficient Masked Autoencoder Counting (E-MAC) +framework in this paper. To empower the model's representation ability on +density regression, we develop a new $\mathtt{D}$ensity-$\mathtt{E}$mbedded +$\mathtt{M}$asked m$\mathtt{O}$deling ($\mathtt{DEMO}$) method, which first +takes the density map as an auxiliary modality to perform multimodal +self-representation learning for image and density map. Although +$\mathtt{DEMO}$ contributes to effective cross-modal regression guidance, it +also brings in redundant background information, making it difficult to focus +on the foreground regions. To handle this dilemma, we propose an efficient +spatial adaptive masking derived from density maps to boost efficiency. +Meanwhile, we employ an optical flow-based temporal collaborative fusion +strategy to effectively capture the dynamic variations across frames, aligning +features to derive multi-frame density residuals. The counting accuracy of the +current frame is boosted by harnessing the information from adjacent frames. In +addition, considering that most existing datasets are limited to human-centric +scenarios, we first propose a large video bird counting dataset, DroneBird, in +natural scenarios for migratory bird protection. Extensive experiments on three +crowd datasets and our \textit{DroneBird} validate our superiority against the +counterparts. The code and dataset are available. + +
+
+ comment: ICLR25 +
+
+
+
+
+ + ♻ ☆ LaVin-DiT: Large Vision Diffusion Transformer CVPR 2025 + + +
+ This paper presents the Large Vision Diffusion Transformer (LaVin-DiT), a +scalable and unified foundation model designed to tackle over 20 computer +vision tasks in a generative framework. Unlike existing large vision models +directly adapted from natural language processing architectures, which rely on +less efficient autoregressive techniques and disrupt spatial relationships +essential for vision data, LaVin-DiT introduces key innovations to optimize +generative performance for vision tasks. First, to address the high +dimensionality of visual data, we incorporate a spatial-temporal variational +autoencoder that encodes data into a continuous latent space. Second, for +generative modeling, we develop a joint diffusion transformer that +progressively produces vision outputs. Third, for unified multi-task training, +in-context learning is implemented. Input-target pairs serve as task context, +which guides the diffusion transformer to align outputs with specific tasks +within the latent space. During inference, a task-specific context set and test +data as queries allow LaVin-DiT to generalize across tasks without fine-tuning. +Trained on extensive vision datasets, the model is scaled from 0.1B to 3.4B +parameters, demonstrating substantial scalability and state-of-the-art +performance across diverse vision tasks. This work introduces a novel pathway +for large vision foundation models, underscoring the promising potential of +diffusion transformers. The code and models are available. + +
+
+ comment: 37 pages, 30 figures, 4 tables. Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Self-Adaptive Gamma Context-Aware SSM-based Model for Metal Defect + Detection + + +
+ Metal defect detection is critical in industrial quality assurance, yet +existing methods struggle with grayscale variations and complex defect states, +limiting their robustness. To address these challenges, this paper proposes a +Self-Adaptive Gamma Context-Aware SSM-based model (GCM-DET). This advanced +detection framework integrates a Dynamic Gamma Correction (GC) module to +enhance grayscale representation and optimize feature extraction for precise +defect reconstruction. A State-Space Search Management (SSM) architecture +captures robust multi-scale features, effectively handling defects of varying +shapes and scales. Focal Loss is employed to mitigate class imbalance and +refine detection accuracy. Additionally, the CD5-DET dataset is introduced, +specifically designed for port container maintenance, featuring significant +grayscale variations and intricate defect patterns. Experimental results +demonstrate that the proposed model achieves substantial improvements, with +mAP@0.5 gains of 27.6\%, 6.6\%, and 2.6\% on the CD5-DET, NEU-DET, and GC10-DET +datasets. + +
+
+ comment: 19 pages, 9 figures, under review +
+
+
+
+
+ + ♻ ☆ DRACO-DehazeNet: An Efficient Image Dehazing Network Combining Detail + Recovery and a Novel Contrastive Learning Paradigm + + +
+ Image dehazing is crucial for clarifying images obscured by haze or fog, but +current learning-based approaches are dependent on large volumes of training +data and hence consume significant computational power. Additionally, their +performance is often inadequate under non-uniform or heavy haze. To address +these challenges, we developed the Detail Recovery And Contrastive DehazeNet, +which facilitates efficient and effective dehazing via a dense dilated inverted +residual block and an attention-based detail recovery network that tailors +enhancements to specific dehazed scene contexts. A major innovation is its +ability to train effectively with limited data, achieved through a novel +quadruplet loss-based contrastive dehazing paradigm. This approach distinctly +separates hazy and clear image features while also distinguishing lower-quality +and higher-quality dehazed images obtained from each sub-module of our +network, thereby refining the dehazing process to a larger extent. Extensive +tests on a variety of benchmarked haze datasets demonstrated the superiority of +our approach. The code repository for this work is available at +https://github.com/GreedYLearner1146/DRACO-DehazeNet. + +
+
+ comment: Once the paper is accepted and published, the copyright will be + transferred to the corresponding journal +
+
+
+
+
+ + ♻ ☆ Novel Pipeline for Diagnosing Acute Lymphoblastic Leukemia Sensitive to + Related Biomarkers + + +
+ Acute Lymphoblastic Leukemia (ALL) is one of the most common types of +childhood blood cancer. The quick start of the treatment process is critical to +saving the patient's life, and for this reason, early diagnosis of this disease +is essential. Examining the blood smear images of these patients is one of the +methods used by expert doctors to diagnose this disease. Deep learning-based +methods have numerous applications in medical fields, as they have +significantly advanced in recent years. ALL diagnosis is not an exception in +this field, and several machine learning-based methods for this problem have +been proposed. In previous methods, high diagnostic accuracy was reported, but +our work showed that this alone is not sufficient, as it can lead to models +taking shortcuts and not making meaningful decisions. This issue arises due to +the small size of medical training datasets. To address this, we constrained +our model to follow a pipeline inspired by experts' work. We also demonstrated +that, since a judgement based on only one image is insufficient, redefining the +problem as a multiple-instance learning problem is necessary for achieving a +practical result. Our model is the first to provide a solution to this problem +in a multiple-instance learning setup. We introduced a novel pipeline for +diagnosing ALL that approximates the process used by hematologists, is +sensitive to disease biomarkers, and achieves an accuracy of 96.15%, an +F1-score of 94.24%, a sensitivity of 97.56%, and a specificity of 90.91% on ALL +IDB 1. Our method was further evaluated on an out-of-distribution dataset, +which posed a challenging test and had acceptable performance. Notably, our +model was trained on a relatively small dataset, highlighting the potential for +our approach to be applied to other medical datasets with limited data +availability. + +
+
+
+
+
+ + ♻ ☆ Dur360BEV: A Real-world 360-degree Single Camera Dataset and Benchmark + for Bird-Eye View Mapping in Autonomous Driving + + +
+ We present Dur360BEV, a novel spherical camera autonomous driving dataset +equipped with a high-resolution 128-channel 3D LiDAR and a RTK-refined GNSS/INS +system, along with a benchmark architecture designed to generate Bird-Eye-View +(BEV) maps using only a single spherical camera. This dataset and benchmark +address the challenges of BEV generation in autonomous driving, particularly by +reducing hardware complexity through the use of a single 360-degree camera +instead of multiple perspective cameras. Within our benchmark architecture, we +propose a novel spherical-image-to-BEV module that leverages spherical imagery +and a refined sampling strategy to project features from 2D to 3D. Our approach +also includes an innovative application of focal loss, specifically adapted to +address the extreme class imbalance often encountered in BEV segmentation +tasks, that demonstrates improved segmentation performance on the Dur360BEV +dataset. The results show that our benchmark not only simplifies the sensor +setup but also achieves competitive performance. + +
+
+
+
+
+ + ♻ ☆ DexMimicGen: Automated Data Generation for Bimanual Dexterous + Manipulation via Imitation Learning ICRA 2025 + + +
+ Imitation learning from human demonstrations is an effective means to teach +robots manipulation skills. But data acquisition is a major bottleneck in +applying this paradigm more broadly, due to the amount of cost and human effort +involved. There has been significant interest in imitation learning for +bimanual dexterous robots, like humanoids. Unfortunately, data collection is +even more challenging here due to the challenges of simultaneously controlling +multiple arms and multi-fingered hands. Automated data generation in simulation +is a compelling, scalable alternative to fuel this need for data. To this end, +we introduce DexMimicGen, a large-scale automated data generation system that +synthesizes trajectories from a handful of human demonstrations for humanoid +robots with dexterous hands. We present a collection of simulation environments +in the setting of bimanual dexterous manipulation, spanning a range of +manipulation behaviors and different requirements for coordination among the +two arms. We generate 21K demos across these tasks from just 60 source human +demos and study the effect of several data generation and policy learning +decisions on agent performance. Finally, we present a real-to-sim-to-real +pipeline and deploy it on a real-world humanoid can sorting task. Generated +datasets, simulation environments and additional results are at +https://dexmimicgen.github.io/ + +
+
+ comment: ICRA 2025. Project website: https://dexmimicgen.github.io/ +
+
+
+
+
+ + ♻ ☆ LangGas: Introducing Language in Selective Zero-Shot Background + Subtraction for Semi-Transparent Gas Leak Detection with a New Dataset + + +
+ Gas leakage poses a significant hazard that requires prevention. +Traditionally, human inspection has been used for detection, a slow and +labour-intensive process. Recent research has applied machine learning +techniques to this problem, yet there remains a shortage of high-quality, +publicly available datasets. This paper introduces a synthetic dataset +featuring diverse backgrounds, interfering foreground objects, diverse leak +locations, and precise segmentation ground truth. We propose a zero-shot method +that combines background subtraction, zero-shot object detection, filtering, +and segmentation to leverage this dataset. Experimental results indicate that +our approach significantly outperforms baseline methods based solely on +background subtraction and zero-shot object detection with segmentation, +reaching an IoU of 69\% overall. We also present an analysis of various prompt +configurations and threshold settings to provide deeper insights into the +performance of our method. The code and dataset will be released after +publication. + +
+
+
+
+
+ + ♻ ☆ Comparing Deep Neural Network for Multi-Label ECG Diagnosis From Scanned + ECG + + +
+ Automated ECG diagnosis has seen significant advancements with deep learning +techniques, but real-world applications still face challenges when dealing with +scanned paper ECGs. In this study, we explore multi-label classification of +ECGs extracted from scanned images, moving beyond traditional binary +classification (normal/abnormal). We evaluate the performance of multiple deep +neural network architectures, including AlexNet, VGG, ResNet, and Vision +Transformer, on scanned ECG datasets. Our comparative analysis examines model +accuracy, robustness to image artifacts, and generalizability across different +ECG conditions. Additionally, we investigate whether ECG signals extracted from +scanned images retain sufficient diagnostic information for reliable automated +classification. The findings highlight the strengths and limitations of each +architecture, providing insights into the feasibility of image-based ECG +diagnosis and its potential integration into clinical workflows. + +
+
+
+
+
+ + ♻ ☆ Human Motion Instruction Tuning CVPR 2025 + + +
+ This paper presents LLaMo (Large Language and Human Motion Assistant), a +multimodal framework for human motion instruction tuning. In contrast to +conventional instruction-tuning approaches that convert non-linguistic inputs, +such as video or motion sequences, into language tokens, LLaMo retains motion +in its native form for instruction tuning. This method preserves +motion-specific details that are often diminished in tokenization, thereby +improving the model's ability to interpret complex human behaviors. By +processing both video and motion data alongside textual inputs, LLaMo enables a +flexible, human-centric analysis. Experimental evaluations across +high-complexity domains, including human behaviors and professional activities, +indicate that LLaMo effectively captures domain-specific knowledge, enhancing +comprehension and prediction in motion-intensive scenarios. We hope LLaMo +offers a foundation for future multimodal AI systems with broad applications, +from sports analytics to behavioral prediction. Our code and models are +available on the project website: https://github.com/ILGLJ/LLaMo. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ DreamText: High Fidelity Scene Text Synthesis + + +
+ Scene text synthesis involves rendering specified texts onto arbitrary +images. Current methods typically formulate this task in an end-to-end manner +but lack effective character-level guidance during training. Besides, their +text encoders, pre-trained on a single font type, struggle to adapt to the +diverse font styles encountered in practical applications. Consequently, these +methods suffer from character distortion, repetition, and absence, particularly +in polystylistic scenarios. To this end, this paper proposes DreamText for +high-fidelity scene text synthesis. Our key idea is to reconstruct the +diffusion training process, introducing more refined guidance tailored to this +task, to expose and rectify the model's attention at the character level and +strengthen its learning of text regions. This transformation poses a hybrid +optimization challenge, involving both discrete and continuous variables. To +effectively tackle this challenge, we employ a heuristic alternate optimization +strategy. Meanwhile, we jointly train the text encoder and generator to +comprehensively learn and utilize the diverse font present in the training +dataset. This joint training is seamlessly integrated into the alternate +optimization process, fostering a synergistic relationship between learning +character embedding and re-estimating character attention. Specifically, in +each step, we first encode potential character-generated position information +from cross-attention maps into latent character masks. These masks are then +utilized to update the representation of specific characters in the current +step, which, in turn, enables the generator to correct the character's +attention in the subsequent steps. Both qualitative and quantitative results +demonstrate the superiority of our method to the state of the art. + +
+
+ comment: Code: https://github.com/CodeGoat24/DreamText, Project page: + https://codegoat24.github.io/DreamText/ +
+
+
+
+
+ + ♻ ☆ TractCloud-FOV: Deep Learning-based Robust Tractography Parcellation in + Diffusion MRI with Incomplete Field of View + + +
+ Tractography parcellation classifies streamlines reconstructed from diffusion +MRI into anatomically defined fiber tracts for clinical and research +applications. However, clinical scans often have incomplete fields of view +(FOV) where brain regions are partially imaged, leading to partial or truncated +fiber tracts. To address this challenge, we introduce TractCloud-FOV, a deep +learning framework that robustly parcellates tractography under conditions of +incomplete FOV. We propose a novel training strategy, FOV-Cut Augmentation +(FOV-CA), in which we synthetically cut tractograms to simulate a spectrum of +real-world inferior FOV cutoff scenarios. This data augmentation approach +enriches the training set with realistic truncated streamlines, enabling the +model to achieve superior generalization. We evaluate the proposed +TractCloud-FOV on both synthetically cut tractography and two real-life +datasets with incomplete FOV. TractCloud-FOV significantly outperforms several +state-of-the-art methods on all testing datasets in terms of streamline +classification accuracy, generalization ability, tract anatomical depiction, +and computational efficiency. Overall, TractCloud-FOV achieves efficient and +consistent tractography parcellation in diffusion MRI with incomplete FOV. + +
+
+
+
+
+ + ♻ ☆ Reasoning to Attend: Try to Understand How <SEG> Token Works CVPR 2025 + + +
+ Current Large Multimodal Models (LMMs) empowered visual grounding typically +rely on $\texttt{<SEG>}$ tokens as a text prompt to jointly optimize the +vision-language model (e.g., LLaVA) and the downstream task-specific model +(e.g., SAM). However, we observe that little research has looked into how it +works. In this work, we first visualize the similarity maps, which are obtained +by computing the semantic similarity between the $\texttt{<SEG>}$ token and the +image token embeddings derived from the last hidden layer in both the LLaVA +encoder and SAM decoder. Intriguingly, we have found that a striking +consistency holds in terms of activation responses in the similarity map, which +reveals that what the $\texttt{<SEG>}$ token contributes to is semantic +similarity within image-text pairs. Specifically, the $\texttt{<SEG>}$ token, a +placeholder expanded in text vocabulary, extensively queries among individual +tokenized image patches to match the semantics of an object from text to the +paired image, while the Large Language Models (LLMs) are being fine-tuned. Upon +the above findings, we present READ, which facilitates LMMs' resilient +$\textbf{REA}$soning capability of where to atten$\textbf{D}$ under the +guidance of highly activated points borrowed from similarity maps. Remarkably, +READ features an intuitive design, Similarity as Points module (SasP), which +can be seamlessly applied to $\texttt{<SEG>}$-like paradigms in a plug-and-play +fashion. Also, extensive experiments have been conducted on ReasonSeg and +RefCOCO(+/g) datasets. To validate whether READ suffers from catastrophic +forgetting of previous skills after fine-tuning, we further assess its +generation ability on an augmented FP-RefCOCO(+/g) dataset. All codes and +models are publicly available at https://github.com/rui-qian/READ. + +
+
+ comment: This work has been accepted to CVPR 2025, please refer to + https://github.com/rui-qian/READ +
+
+
+
+
+ + ♻ ☆ Visual Description Grounding Reduces Hallucinations and Boosts Reasoning + in LVLMs ICLR 2025 + + +
+ Large Vision-Language Models (LVLMs) often produce responses that misalign +with factual information, a phenomenon known as hallucinations. While +hallucinations are well-studied, the exact causes behind them remain +underexplored. In this paper, we first investigate the root causes of +hallucinations in LVLMs. Our findings reveal that existing mitigation +techniques primarily reduce hallucinations for visual recognition prompts -- those +that require simple descriptions of visual elements -- but fail for cognitive +prompts that demand deliberate reasoning. We identify the core issue as a lack +of true visual perception in LVLMs: although they can accurately recognize +visual elements, they struggle to fully interpret these elements in the context +of the input prompt and effectively link this recognition to their internal +knowledge, which is critical for reasoning. To address this gap, we introduce +Visual Description Grounded Decoding (VDGD), a simple, robust, and +training-free method designed to enhance visual perception and improve +reasoning capabilities in LVLMs. VDGD works by first generating a detailed +description of the image and appending it as a prefix to the instruction. +During response generation, tokens are sampled based on their KL divergence to +the description, favoring candidates with lower divergence. Experimental +results on multiple visual reasoning benchmarks and LVLMs demonstrate that VDGD +consistently outperforms existing baselines by 2% - 33%. Finally, we introduce +VaLLu, a benchmark designed for comprehensive evaluation of the cognitive +capabilities of LVLMs. + +
+
+ comment: Accepted to ICLR 2025. Project: https://sreyan88.github.io/VDGD/ +
+
+
+
+
+ + ♻ ☆ Iterative Flow Matching -- Path Correction and Gradual Refinement for + Enhanced Generative Modeling + + +
+ Generative models for image generation are now commonly used for a wide +variety of applications, ranging from guided image generation for entertainment +to solving inverse problems. Nonetheless, training a generator is a non-trivial +feat that requires fine-tuning and can lead to so-called hallucinations, that +is, the generation of images that are unrealistic. In this work, we explore +image generation using flow matching. We explain and demonstrate why flow +matching can generate hallucinations, and propose an iterative process to +improve the generation process. Our iterative process can be integrated into +virtually $\textit{any}$ generative modeling technique, thereby enhancing the +performance and robustness of image synthesis systems. + +
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ LesionDiffusion: Towards Text-controlled General Lesion Synthesis + + +
+ Fully-supervised lesion recognition methods in medical imaging face +challenges due to the reliance on large annotated datasets, which are expensive +and difficult to collect. To address this, synthetic lesion generation has +become a promising approach. However, existing models struggle with +scalability, fine-grained control over lesion attributes, and the generation of +complex structures. We propose LesionDiffusion, a text-controllable lesion +synthesis framework for 3D CT imaging that generates both lesions and +corresponding masks. By utilizing a structured lesion report template, our +model provides greater control over lesion attributes and supports a wider +variety of lesion types. We introduce a dataset of 1,505 annotated CT scans +with paired lesion masks and structured reports, covering 14 lesion types +across 8 organs. LesionDiffusion consists of two components: a lesion mask +synthesis network (LMNet) and a lesion inpainting network (LINet), both guided +by lesion attributes and image features. Extensive experiments demonstrate that +LesionDiffusion significantly improves segmentation performance, with strong +generalization to unseen lesion types and organs, outperforming current +state-of-the-art models. Code will be available at +https://github.com/HengruiTianSJTU/LesionDiffusion. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Shazam: Unifying Multiple Foundation Models for Advanced Computational + Pathology + + +
+ Foundation Models (FMs) in computational pathology (CPath) have significantly +advanced the extraction of meaningful features from histopathology image +datasets, achieving strong performance across various clinical tasks. Despite +their impressive performance, these models often exhibit variability when +applied to different tasks, prompting the need for a unified framework capable +of consistently excelling across various applications. In this work, we propose +Shazam, a novel framework designed to efficiently combine multiple CPath +models. Unlike previous approaches that train a fixed-parameter FM, Shazam +dynamically extracts and refines information from diverse FMs for each specific +task. To ensure that each FM contributes effectively without dominance, a novel +distillation strategy is applied, guiding the student model with features from +all teacher models, which enhances its generalization ability. Experimental +results on two pathology patch classification datasets demonstrate that Shazam +outperforms existing CPath models and other fusion methods. Its lightweight, +flexible design makes it a promising solution for improving CPath analysis in +real-world settings. Code will be available at +https://github.com/Tuner12/Shazam. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ DSPNet: Dual-vision Scene Perception for Robust 3D Question Answering CVPR 2025 + + +
+ 3D Question Answering (3D QA) requires the model to comprehensively +understand its situated 3D scene described by the text, then reason about its +surrounding environment and answer a question under that situation. However, +existing methods usually rely on global scene perception from pure 3D point +clouds and overlook the importance of rich local texture details from +multi-view images. Moreover, due to the inherent noise in camera poses and +complex occlusions, there exists significant feature degradation and reduced +feature robustness problems when aligning 3D point cloud with multi-view +images. In this paper, we propose a Dual-vision Scene Perception Network +(DSPNet), to comprehensively integrate multi-view and point cloud features to +improve robustness in 3D QA. Our Text-guided Multi-view Fusion (TGMF) module +prioritizes image views that closely match the semantic content of the text. To +adaptively fuse back-projected multi-view images with point cloud features, we +design the Adaptive Dual-vision Perception (ADVP) module, enhancing 3D scene +comprehension. Additionally, our Multimodal Context-guided Reasoning (MCGR) +module facilitates robust reasoning by integrating contextual information +across visual and linguistic modalities. Experimental results on SQA3D and +ScanQA datasets demonstrate the superiority of our DSPNet. Codes will be +available at https://github.com/LZ-CH/DSPNet. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ AutoBench-V: Can Large Vision-Language Models Benchmark Themselves? + + +
+ Large Vision-Language Models (LVLMs) have become essential for advancing the +integration of visual and linguistic information. However, the evaluation of +LVLMs presents significant challenges as the evaluation benchmark always +demands lots of human cost for its construction, and remains static, lacking +flexibility once constructed. Even though automatic evaluation has been +explored in textual modality, the visual modality remains under-explored. As a +result, in this work, we address a question: "Can LVLMs themselves be used to +benchmark each other automatically in the visual domain?". We introduce +AutoBench-V, an automated framework for serving evaluation on demand, i.e., +benchmarking LVLMs based on specific aspects of model capability. AutoBench-V +leverages text-to-image models to generate relevant image samples and then +utilizes LVLMs to orchestrate visual question-answering (VQA) tasks, completing +the evaluation process efficiently and flexibly. Through an extensive +evaluation of nine popular LVLMs across five demanded user inputs (i.e., +evaluation capabilities), the framework shows effectiveness and reliability. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 151 + +
+
+
+ + ☆ L$^2$M: Mutual Information Scaling Law for Long-Context Language + Modeling + + +
+ We rigorously establish a bipartite mutual information scaling law in natural +language that governs long-range dependencies. This scaling law, which we show +is distinct from and scales independently of the conventional two-point mutual +information, is the key to understanding long-context language modeling. Using +this scaling law, we formulate the Long-context Language Modeling (L$^2$M) +condition, which relates a model's capacity for effective long context length +modeling to the scaling of its latent state size for storing past information. +Our results are validated through experiments on both transformers and state +space models. This work establishes a theoretical foundation that guides the +development of large language models toward longer context lengths. + +
+
+ comment: 29 pages, 12 figures, 1 table +
+
+
+
+
+ + ☆ Shifting Long-Context LLMs Research from Input to Output + + +
+ Recent advancements in long-context Large Language Models (LLMs) have +primarily concentrated on processing extended input contexts, resulting in +significant strides in long-context comprehension. However, the equally +critical aspect of generating long-form outputs has received comparatively less +attention. This paper advocates for a paradigm shift in NLP research toward +addressing the challenges of long-output generation. Tasks such as novel +writing, long-term planning, and complex reasoning require models to understand +extensive contexts and produce coherent, contextually rich, and logically +consistent extended text. These demands highlight a critical gap in current LLM +capabilities. We underscore the importance of this under-explored domain and +call for focused efforts to develop foundational LLMs tailored for generating +high-quality, long-form outputs, which hold immense potential for real-world +applications. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Enough Coin Flips Can Make LLMs Act Bayesian + + +
+ Large language models (LLMs) exhibit the ability to generalize given few-shot +examples in their input prompt, an emergent capability known as in-context +learning (ICL). We investigate whether LLMs utilize ICL to perform structured +reasoning in ways that are consistent with a Bayesian framework or rely on +pattern matching. Using a controlled setting of biased coin flips, we find +that: (1) LLMs often possess biased priors, causing initial divergence in +zero-shot settings, (2) in-context evidence outweighs explicit bias +instructions, (3) LLMs broadly follow Bayesian posterior updates, with +deviations primarily due to miscalibrated priors rather than flawed updates, +and (4) attention magnitude has negligible effect on Bayesian inference. With +sufficient demonstrations of biased coin flips via ICL, LLMs update their +priors in a Bayesian manner. + +
+
+
+
+
+ + ☆ Predictable Scale: Part I -- Optimal Hyperparameter Scaling Law in Large + Language Model Pretraining + + +
+ The impressive capabilities of Large Language Models (LLMs) across diverse +tasks are now well-established, yet their effective deployment necessitates +careful hyperparameter optimization. Through extensive empirical studies +involving grid searches across diverse configurations, we discover universal +scaling laws governing these hyperparameters: optimal learning rate follows a +power-law relationship with both model parameters and data sizes, while optimal +batch size scales primarily with data sizes. Our analysis reveals a convex +optimization landscape for hyperparameters under fixed models and data size +conditions. This convexity implies an optimal hyperparameter plateau. We +contribute a universal, plug-and-play optimal hyperparameter tool for the +community. Its estimated values on the test set are merely 0.07% away from the +globally optimal LLM performance found via an exhaustive search. These laws +demonstrate remarkable robustness across variations in model sparsity, training +data distribution, and model shape. To the best of our knowledge, this is the first work +that unifies different model shapes and structures, such as Mixture-of-Experts +models and dense transformers, as well as establishes optimal hyperparameter +scaling laws across diverse data distributions. This exhaustive optimization +process demands substantial computational resources, utilizing nearly one +million NVIDIA H800 GPU hours to train 3,700 LLMs of varying sizes and +hyperparameters from scratch and consuming approximately 100 trillion tokens in +total. To facilitate reproducibility and further research, we will +progressively release all loss measurements and model checkpoints through our +designated repository https://step-law.github.io/ + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Scaling Rich Style-Prompted Text-to-Speech Datasets + + +
+ We introduce Paralinguistic Speech Captions (ParaSpeechCaps), a large-scale +dataset that annotates speech utterances with rich style captions. While rich +abstract tags (e.g. guttural, nasal, pained) have been explored in small-scale +human-annotated datasets, existing large-scale datasets only cover basic tags +(e.g. low-pitched, slow, loud). We combine off-the-shelf text and speech +embedders, classifiers and an audio language model to automatically scale rich +tag annotations for the first time. ParaSpeechCaps covers a total of 59 style +tags, including both speaker-level intrinsic tags and utterance-level +situational tags. It consists of 342 hours of human-labelled data (PSC-Base) +and 2427 hours of automatically annotated data (PSC-Scaled). We finetune +Parler-TTS, an open-source style-prompted TTS model, on ParaSpeechCaps, and +achieve improved style consistency (+7.9% Consistency MOS) and speech quality +(+15.5% Naturalness MOS) over the best performing baseline that combines +existing rich style tag datasets. We ablate several of our dataset design +choices to lay the foundation for future work in this space. Our dataset, +models and code are released at https://github.com/ajd12342/paraspeechcaps . + +
+
+
+
+
+ + ☆ Self-Supervised Models for Phoneme Recognition: Applications in + Children's Speech for Reading Learning + + +
+ Child speech recognition is still an underdeveloped area of research due to +the lack of data (especially on non-English languages) and the specific +difficulties of this task. Having explored various architectures for child +speech recognition in previous work, in this article we tackle recent +self-supervised models. We first compare wav2vec 2.0, HuBERT and WavLM models +adapted to phoneme recognition in French child speech, and continue our +experiments with the best of them, WavLM base+. We then further adapt it by +unfreezing its transformer blocks during fine-tuning on child speech, which +greatly improves its performance and makes it significantly outperform our base +model, a Transformer+CTC. Finally, we study in detail the behaviour of these +two models under the real conditions of our application, and show that WavLM +base+ is more robust to various reading tasks and noise levels. Index Terms: +speech recognition, child speech, self-supervised learning + +
+
+ comment: This paper was originally published in the Proceedings of Interspeech + 2024. DOI: 10.21437/Interspeech.2024-1095 +
+
+
+
+
+ + ☆ Universality of Layer-Level Entropy-Weighted Quantization Beyond Model + Architecture and Size + + +
+ We present a novel approach to selective model quantization that transcends +the limitations of architecture-specific and size-dependent compression methods +for Large Language Models (LLMs) using Entropy-Weighted Quantization (EWQ). By +analyzing the entropy distribution across transformer blocks, EWQ determines +which blocks can be safely quantized without causing significant performance +degradation, independent of model architecture or size. Our method outperforms +uniform quantization approaches, maintaining Massive Multitask Language +Understanding (MMLU) accuracy scores within 0.5% of unquantized models while +reducing memory usage by up to 18%. We demonstrate the effectiveness of EWQ +across multiple architectures-from 1.6B to 70B parameters-showcasing consistent +improvements in the quality-compression trade-off regardless of model scale or +architectural design. A surprising finding of EWQ is its ability to reduce +perplexity compared to unquantized models, suggesting the presence of +beneficial regularization through selective precision reduction. This +improvement holds across different model families, indicating a fundamental +relationship between layer-level entropy and optimal precision requirements. +Additionally, we introduce FastEWQ, a rapid method for entropy distribution +analysis that eliminates the need for loading model weights. This technique +leverages universal characteristics of entropy distribution that persist across +various architectures and scales, enabling near-instantaneous quantization +decisions while maintaining 80% classification accuracy with full entropy +analysis. Our results demonstrate that effective quantization strategies can be +developed independently of specific architectural choices or model sizes, +opening new possibilities for efficient LLM deployment. + +
+
+ comment: 29 pages, 7 figures, 14 tables; Comments are welcome +
+
+
+
+
+ + ☆ L1: Controlling How Long A Reasoning Model Thinks With Reinforcement + Learning + + +
+ Reasoning language models have shown an uncanny ability to improve +performance at test-time by ``thinking longer''-that is, by generating longer +chain-of-thought sequences and hence using more compute. However, the length of +their chain-of-thought reasoning is not controllable, making it impossible to +allocate test-time compute to achieve a desired level of performance. We +introduce Length Controlled Policy Optimization (LCPO), a simple reinforcement +learning method that optimizes for accuracy and adherence to user-specified +length constraints. We use LCPO to train L1, a reasoning language model that +produces outputs satisfying a length constraint given in its prompt. L1's +length control allows for smoothly trading off computational cost and accuracy +on a wide range of tasks, and outperforms the state-of-the-art S1 method for +length control. Furthermore, we uncover an unexpected short chain-of-thought +capability in models trained with LCPO. For instance, our 1.5B L1 model +surpasses GPT-4o at equal reasoning lengths. Overall, LCPO enables precise +control over reasoning length, allowing for fine-grained allocation of +test-time compute and accuracy. We release code and models at +https://www.cmu-l3.github.io/l1 + +
+
+
+
+
+ + ☆ Matrix Factorization for Inferring Associations and Missing Links + + +
+ Missing link prediction is a method for network analysis, with applications +in recommender systems, biology, social sciences, cybersecurity, information +retrieval, and Artificial Intelligence (AI) reasoning in Knowledge Graphs. +Missing link prediction identifies unseen but potentially existing connections +in a network by analyzing the observed patterns and relationships. In +proliferation detection, this supports efforts to identify and characterize +attempts by state and non-state actors to acquire nuclear weapons or associated +technology - a notoriously challenging but vital mission for global security. +Dimensionality reduction techniques like Non-Negative Matrix Factorization +(NMF) and Logistic Matrix Factorization (LMF) are effective but require +selection of the matrix rank parameter, that is, of the number of hidden +features, k, to avoid over/under-fitting. We introduce novel Weighted (WNMFk), +Boolean (BNMFk), and Recommender (RNMFk) matrix factorization methods, along +with ensemble variants incorporating logistic factorization, for link +prediction. Our methods integrate automatic model determination for rank +estimation by evaluating stability and accuracy using a modified bootstrap +methodology and uncertainty quantification (UQ), assessing prediction +reliability under random perturbations. We incorporate Otsu threshold selection +and k-means clustering for Boolean matrix factorization, comparing them to +coordinate descent-based Boolean thresholding. Our experiments highlight the +impact of rank k selection, evaluate model performance under varying test-set +sizes, and demonstrate the benefits of UQ for reliable predictions using +abstention. We validate our methods on three synthetic datasets (Boolean and +uniformly distributed) and benchmark them against LMF and symmetric LMF +(symLMF) on five real-world protein-protein interaction networks, showcasing an +improved prediction performance. + +
+
+ comment: 35 pages, 14 figures, 3 tables, 1 algorithm +
+
+
+
+
+ + ☆ Multi-Agent Inverse Q-Learning from Demonstrations ICRA + + +
+ When reward functions are hand-designed, deep reinforcement learning +algorithms often suffer from reward misspecification, causing them to learn +suboptimal policies in terms of the intended task objectives. In the +single-agent case, inverse reinforcement learning (IRL) techniques attempt to +address this issue by inferring the reward function from expert demonstrations. +However, in multi-agent problems, misalignment between the learned and true +objectives is exacerbated due to increased environment non-stationarity and +variance that scales with multiple agents. As such, in multi-agent general-sum +games, multi-agent IRL algorithms have difficulty balancing cooperative and +competitive objectives. To address these issues, we propose Multi-Agent +Marginal Q-Learning from Demonstrations (MAMQL), a novel sample-efficient +framework for multi-agent IRL. For each agent, MAMQL learns a critic +marginalized over the other agents' policies, allowing for a well-motivated use +of Boltzmann policies in the multi-agent context. We identify a connection +between optimal marginalized critics and single-agent soft-Q IRL, allowing us +to apply a direct, simple optimization criterion from the single-agent domain. +Across our experiments on three different simulated domains, MAMQL +significantly outperforms previous multi-agent methods in average reward, +sample efficiency, and reward recovery by often more than 2-5x. We make our +code available at https://sites.google.com/view/mamql . + +
+
+ comment: 8 pages, 4 figures, 2 tables. Published at the International + Conference on Robotics and Automation (ICRA) 2025 +
+
+
+
+
+ + ☆ Implicit Cross-Lingual Rewarding for Efficient Multilingual Preference + Alignment + + +
+ Direct Preference Optimization (DPO) has become a prominent method for +aligning Large Language Models (LLMs) with human preferences. While DPO has +enabled significant progress in aligning English LLMs, multilingual preference +alignment is hampered by data scarcity. To address this, we propose a novel +approach that $\textit{captures}$ learned preferences from well-aligned English +models by implicit rewards and $\textit{transfers}$ them to other languages +through iterative training. Specifically, we derive an implicit reward model +from the logits of an English DPO-aligned model and its corresponding reference +model. This reward model is then leveraged to annotate preference relations in +cross-lingual instruction-following pairs, using English instructions to +evaluate multilingual responses. The annotated data is subsequently used for +multilingual DPO fine-tuning, facilitating preference knowledge transfer from +English to other languages. Fine-tuning Llama3 for two iterations resulted in a +12.72% average improvement in Win Rate and a 5.97% increase in Length Control +Win Rate across all training languages on the X-AlpacaEval leaderboard. Our +findings demonstrate that leveraging existing English-aligned models can enable +efficient and effective multilingual preference alignment, significantly +reducing the need for extensive multilingual preference data. The code is +available at https://github.com/ZNLP/Implicit-Cross-Lingual-Rewarding + +
+
+ comment: Work in progress +
+
+
+
+
+ + Simulating the Real World: A Unified Survey of Multimodal Generative + Models + + +
+ Understanding and replicating the real world is a critical challenge in +Artificial General Intelligence (AGI) research. To achieve this, many existing +approaches, such as world models, aim to capture the fundamental principles +governing the physical world, enabling more accurate simulations and meaningful +interactions. However, current methods often treat different modalities, +including 2D (images), videos, 3D, and 4D representations, as independent +domains, overlooking their interdependencies. Additionally, these methods +typically focus on isolated dimensions of reality without systematically +integrating their connections. In this survey, we present a unified survey for +multimodal generative models that investigate the progression of data +dimensionality in real-world simulation. Specifically, this survey starts from +2D generation (appearance), then moves to video (appearance+dynamics) and 3D +generation (appearance+geometry), and finally culminates in 4D generation that +integrates all dimensions. To the best of our knowledge, this is the first +attempt to systematically unify the study of 2D, video, 3D and 4D generation +within a single framework. To guide future research, we provide a comprehensive +review of datasets, evaluation metrics and future directions, fostering +insights for newcomers. This survey serves as a bridge to advance the study of +multimodal generative models and real-world simulation within a unified +framework. + +
+
+ comment: Repository for the related papers at + https://github.com/ALEEEHU/World-Simulator +
+
+
+
+
+ + ☆ Mark Your LLM: Detecting the Misuse of Open-Source Large Language Models + via Watermarking ICLR 2025 + + +
+ As open-source large language models (LLMs) like Llama3 become more capable, +it is crucial to develop watermarking techniques to detect their potential +misuse. Existing watermarking methods either add watermarks during LLM +inference, which is unsuitable for open-source LLMs, or primarily target +classification LLMs rather than recent generative LLMs. Adapting these +watermarks to open-source LLMs for misuse detection remains an open challenge. +This work defines two misuse scenarios for open-source LLMs: intellectual +property (IP) violation and LLM Usage Violation. Then, we explore the +application of inference-time watermark distillation and backdoor watermarking +in these contexts. We propose comprehensive evaluation methods to assess the +impact of various real-world further fine-tuning scenarios on watermarks and +the effect of these watermarks on LLM performance. Our experiments reveal that +backdoor watermarking could effectively detect IP Violation, while +inference-time watermark distillation is applicable in both scenarios but less +robust to further fine-tuning and has a more significant impact on LLM +performance compared to backdoor watermarking. Exploring more advanced +watermarking methods for open-source LLMs to detect their misuse should be an +important future direction. + +
+
+ comment: Accepted by the 1st Workshop on GenAI Watermarking, collocated with + ICLR 2025 +
+
+
+
+
+ + ☆ IDInit: A Universal and Stable Initialization Method for Neural Network + Training ICLR 2025 + + +
+ Deep neural networks have achieved remarkable accomplishments in practice. +The success of these networks hinges on effective initialization methods, which +are vital for ensuring stable and rapid convergence during training. Recently, +initialization methods that maintain identity transition within layers have +shown good efficiency in network training. These techniques (e.g., Fixup) set +specific weights to zero to achieve identity control. However, settings of +remaining weight (e.g., Fixup uses random values to initialize non-zero +weights) will affect the inductive bias that is achieved only by a zero weight, +which may be harmful to training. Addressing this concern, we introduce fully +identical initialization (IDInit), a novel method that preserves identity in +both the main and sub-stem layers of residual networks. IDInit employs a padded +identity-like matrix to overcome rank constraints in non-square weight +matrices. Furthermore, we show the convergence problem of an identity matrix +can be solved by stochastic gradient descent. Additionally, we enhance the +universality of IDInit by processing higher-order weights and addressing dead +neuron problems. IDInit is a straightforward yet effective initialization +method, with improved convergence, stability, and performance across various +settings, including large-scale datasets and deep models. + +
+
+ comment: Accepted in ICLR 2025 +
+
+
+
+
+ + ☆ The Best of Both Worlds: Integrating Language Models and Diffusion + Models for Video Generation + + +
+ Recent advancements in text-to-video (T2V) generation have been driven by two +competing paradigms: autoregressive language models and diffusion models. +However, each paradigm has intrinsic limitations: language models struggle with +visual quality and error accumulation, while diffusion models lack semantic +understanding and causal modeling. In this work, we propose LanDiff, a hybrid +framework that synergizes the strengths of both paradigms through +coarse-to-fine generation. Our architecture introduces three key innovations: +(1) a semantic tokenizer that compresses 3D visual features into compact 1D +discrete representations through efficient semantic compression, achieving a +$\sim$14,000$\times$ compression ratio; (2) a language model that generates +semantic tokens with high-level semantic relationships; (3) a streaming +diffusion model that refines coarse semantics into high-fidelity videos. +Experiments show that LanDiff, a 5B model, achieves a score of 85.43 on the +VBench T2V benchmark, surpassing the state-of-the-art open-source models +Hunyuan Video (13B) and other commercial models such as Sora, Keling, and +Hailuo. Furthermore, our model also achieves state-of-the-art performance in +long video generation, surpassing other open-source models in this field. Our +demo can be viewed at https://landiff.github.io/. + +
+
+
+
+
+ + ☆ HybridNorm: Towards Stable and Efficient Transformer Training via Hybrid + Normalization + + +
+ Transformers have become the de facto architecture for a wide range of +machine learning tasks, particularly in large language models (LLMs). Despite +their remarkable performance, challenges remain in training deep transformer +networks, especially regarding the location of layer normalization. While +Pre-Norm structures facilitate easier training due to their more prominent +identity path, they often yield suboptimal performance compared to Post-Norm. +In this paper, we propose $\textbf{HybridNorm}$, a straightforward yet +effective hybrid normalization strategy that integrates the advantages of both +Pre-Norm and Post-Norm approaches. Specifically, HybridNorm employs QKV +normalization within the attention mechanism and Post-Norm in the feed-forward +network (FFN) of each transformer block. This design not only stabilizes +training but also enhances performance, particularly in the context of LLMs. +Comprehensive experiments in both dense and sparse architectures show that +HybridNorm consistently outperforms both Pre-Norm and Post-Norm approaches, +achieving state-of-the-art results across various benchmarks. These findings +highlight the potential of HybridNorm as a more stable and effective technique +for improving the training and performance of deep transformer models. Code is +available at +https://github.com/BryceZhuo/HybridNorm. + +
+
+
+
+
+ + ☆ The Next Frontier of LLM Applications: Open Ecosystems and Hardware + Synergy + + +
+ Large Language Model (LLM) applications, including LLM app stores and +autonomous agents, are shaping the future of AI ecosystems. However, platform +silos, fragmented hardware integration, and the absence of standardized +interfaces limit scalability, interoperability, and resource efficiency. While +LLM app stores democratize AI, their closed ecosystems restrict modular AI +reuse and cross-platform portability. Meanwhile, agent-based frameworks offer +flexibility but often lack seamless integration across diverse environments. +This paper envisions the future of LLM applications and proposes a three-layer +decoupled architecture grounded in software engineering principles such as +layered system design, service-oriented architectures, and hardware-software +co-design. This architecture separates application logic, communication +protocols, and hardware execution, enhancing modularity, efficiency, and +cross-platform compatibility. Beyond architecture, we highlight key security +and privacy challenges for safe, scalable AI deployment and outline research +directions in software and security engineering. This vision aims to foster +open, secure, and interoperable LLM ecosystems, guiding future advancements in +AI applications. + +
+
+
+
+
+ + ☆ ValuePilot: A Two-Phase Framework for Value-Driven Decision-Making + + +
+ Despite recent advances in artificial intelligence (AI), it poses challenges +to ensure personalized decision-making in tasks that are not considered in +training datasets. To address this issue, we propose ValuePilot, a two-phase +value-driven decision-making framework comprising a dataset generation toolkit +DGT and a decision-making module DMM trained on the generated data. DGT is +capable of generating scenarios based on value dimensions and closely mirroring +real-world tasks, with automated filtering techniques and human curation to +ensure the validity of the dataset. In the generated dataset, DMM learns to +recognize the inherent values of scenarios, computes action feasibility and +navigates the trade-offs between multiple value dimensions to make personalized +decisions. Extensive experiments demonstrate that, given human value +preferences, our DMM most closely aligns with human decisions, outperforming +Claude-3.5-Sonnet, Gemini-2-flash, Llama-3.1-405b and GPT-4o. This research is +a preliminary exploration of value-driven decision-making. We hope it will +stimulate interest in value-driven decision-making and personalized +decision-making within the community. + +
+
+
+
+
+ + ☆ Fundamental Limits of Hierarchical Secure Aggregation with Cyclic User + Association + + +
+ Secure aggregation is motivated by federated learning (FL) where a cloud +server aims to compute an averaged model (i.e., weights of deep neural +networks) of the locally-trained models of numerous clients, while adhering to +data security requirements. Hierarchical secure aggregation (HSA) extends this +concept to a three-layer network, where clustered users communicate with the +server through an intermediate layer of relays. In HSA, beyond conventional +server security, relay security is also enforced to ensure that the relays +remain oblivious to the users' inputs (an abstraction of the local models in +FL). Existing work on HSA assumes that each user is associated with only one +relay, limiting opportunities for coding across inter-cluster users to achieve +efficient communication and key generation. In this paper, we consider HSA with +a cyclic association pattern where each user is connected to $B$ consecutive +relays in a wrap-around manner. We propose an efficient aggregation scheme +which includes a message design for the inputs inspired by gradient coding--a +well-known technique for efficient communication in distributed computing--along +with a highly nontrivial security key design. We also derive novel converse +bounds on the minimum achievable communication and key rates using +information-theoretic arguments. + +
+
+
+
+
+ + ☆ Compositional Causal Reasoning Evaluation in Language Models + + +
+ Causal reasoning and compositional reasoning are two core aspirations in +generative AI. Measuring the extent of these behaviors requires principled +evaluation methods. We explore a unified perspective that considers both +behaviors simultaneously, termed compositional causal reasoning (CCR): the +ability to infer how causal measures compose and, equivalently, how causal +quantities propagate through graphs. We instantiate a framework for the +systematic evaluation of CCR for the average treatment effect and the +probability of necessity and sufficiency. As proof of concept, we demonstrate +the design of CCR tasks for language models in the Llama, Phi, and GPT +families. On a math word problem, our framework revealed a range of +taxonomically distinct error patterns. Additionally, CCR errors increased with +the complexity of causal paths for all models except o1. + +
+
+
+
+
+ + ☆ Benchmarking Reasoning Robustness in Large Language Models + + +
+ Despite the recent success of large language models (LLMs) in reasoning such +as DeepSeek, we for the first time identify a key dilemma in reasoning +robustness and generalization: significant performance degradation on novel or +incomplete data, suggesting a reliance on memorized patterns rather than +systematic reasoning. Our closer examination reveals four key unique +limitations underlying this issue: (1) Positional bias--models favor earlier +queries in multi-query inputs but answer the wrong one in the latter (e.g., +GPT-4o's accuracy drops from 75.8 percent to 72.8 percent); (2) Instruction +sensitivity--performance declines by 5.0 to 7.5 percent in the Qwen2.5 Series +and by 5.0 percent in DeepSeek-V3 with auxiliary guidance; (3) Numerical +fragility--value substitution sharply reduces accuracy (e.g., GPT-4o drops from +97.5 percent to 82.5 percent, GPT-o1-mini drops from 97.5 percent to 92.5 +percent); and (4) Memory dependence--models resort to guesswork when missing +critical data. These findings further highlight the reliance on heuristic +recall over rigorous logical inference, demonstrating challenges in reasoning +robustness. To comprehensively investigate these robustness challenges, this +paper introduces a novel benchmark, termed Math-RoB, that exploits +hallucinations triggered by missing information to expose reasoning gaps. This +is achieved by an instruction-based approach to generate diverse datasets that +closely resemble training distributions, facilitating a holistic robustness +assessment and advancing the development of more robust reasoning frameworks. + +
+
+
+
+
+ + ☆ Keeping Yourself is Important in Downstream Tuning Multimodal Large + Language Model + + +
+ Multi-modal Large Language Models (MLLMs) integrate visual and linguistic +reasoning to address complex tasks such as image captioning and visual question +answering. While MLLMs demonstrate remarkable versatility, MLLMs exhibit +limited performance on special applications. But tuning MLLMs for downstream +tasks encounters two key challenges: Task-Expert Specialization, where +distribution shifts between pre-training and target datasets constrain target +performance, and Open-World Stabilization, where catastrophic forgetting erases +the model's general knowledge. In this work, we systematically review recent +advancements in MLLM tuning methodologies, classifying them into three +paradigms: (I) Selective Tuning, (II) Additive Tuning, and (III) +Reparameterization Tuning. Furthermore, we benchmark these tuning strategies +across popular MLLM architectures and diverse downstream tasks to establish +standardized evaluation analysis and systematic tuning principles. Finally, we +highlight several open challenges in this domain and propose future research +directions. To facilitate ongoing progress in this rapidly evolving field, we +provide a public repository that continuously tracks developments: +https://github.com/WenkeHuang/Awesome-MLLM-Tuning. + +
+
+
+
+
+ + ☆ SOLAR: Scalable Optimization of Large-scale Architecture for Reasoning + + +
+ Large Language Models (LLMs) excel in reasoning but remain constrained by +their Chain-of-Thought (CoT) approach, which struggles with complex tasks +requiring more nuanced topological reasoning. We introduce SOLAR, Scalable +Optimization of Large-scale Architecture for Reasoning, a framework that +dynamically optimizes various reasoning topologies to enhance accuracy and +efficiency. + Our Topological Annotation Generation (TAG) system automates topological +dataset creation and segmentation, improving post-training and evaluation. +Additionally, we propose Topological-Scaling, a reward-driven framework that +aligns training and inference scaling, equipping LLMs with adaptive, task-aware +reasoning. + SOLAR achieves substantial gains on MATH and GSM8K: +5% accuracy with +Topological Tuning, +9% with Topological Reward, and +10.02% with Hybrid +Scaling. It also reduces response length by over 5% for complex problems, +lowering inference latency. + To foster the reward system, we train a multi-task Topological Reward Model +(M-TRM), which autonomously selects the best reasoning topology and answer in a +single pass, eliminating the need for training and inference on multiple +single-task TRMs (S-TRMs), thus reducing both training cost and inference +latency. In addition, in terms of performance, M-TRM surpasses all S-TRMs, +improving accuracy by +10% and rank correlation by +9%. + To the best of our knowledge, SOLAR sets a new benchmark for scalable, +high-precision LLM reasoning while introducing an automated annotation process +and a dynamic reasoning topology competition mechanism. + +
+
+
+
+
+ + ☆ Dynamic Pricing for On-Demand DNN Inference in the Edge-AI Market + + +
+ The convergence of edge computing and AI gives rise to Edge-AI, which enables +the deployment of real-time AI applications and services at the network edge. +One of the fundamental research issues in Edge-AI is edge inference +acceleration, which aims to realize low-latency high-accuracy DNN inference +services by leveraging the fine-grained offloading of partitioned inference +tasks from end devices to edge servers. However, existing research has yet to +adopt a practical Edge-AI market perspective, which would systematically +explore the personalized inference needs of AI users (e.g., inference accuracy, +latency, and task complexity), the revenue incentives for AI service providers +that offer edge inference services, and multi-stakeholder governance within a +market-oriented context. To bridge this gap, we propose an Auction-based Edge +Inference Pricing Mechanism (AERIA) for revenue maximization to tackle the +multi-dimensional optimization problem of DNN model partition, edge inference +pricing, and resource allocation. We investigate the multi-exit device-edge +synergistic inference scheme for on-demand DNN inference acceleration, and +analyse the auction dynamics amongst the AI service providers, AI users and +edge infrastructure provider. Owing to the strategic mechanism design via +randomized consensus estimate and cost sharing techniques, the Edge-AI market +attains several desirable properties, including competitiveness in revenue +maximization, incentive compatibility, and envy-freeness, which are crucial to +maintain the effectiveness, truthfulness, and fairness of our auction outcomes. +The extensive simulation experiments based on four representative DNN inference +workloads demonstrate that our AERIA mechanism significantly outperforms +several state-of-the-art approaches in revenue maximization, demonstrating the +efficacy of AERIA for on-demand DNN inference in the Edge-AI market. + +
+
+ comment: Index Terms: Edge-AI, DNN Inference Offloading, Resource Management, + Dynamic Pricing, Auction Mechanism +
+
+
+
+
+ + ☆ STX-Search: Explanation Search for Continuous Dynamic Spatio-Temporal + Models + + +
+ Recent improvements in the expressive power of spatio-temporal models have +led to performance gains in many real-world applications, such as traffic +forecasting and social network modelling. However, understanding the +predictions from a model is crucial to ensure reliability and trustworthiness, +particularly for high-risk applications, such as healthcare and transport. Few +existing methods are able to generate explanations for models trained on +continuous-time dynamic graph data and, of these, the computational complexity +and lack of suitable explanation objectives pose challenges. In this paper, we +propose $\textbf{S}$patio-$\textbf{T}$emporal E$\textbf{X}$planation +$\textbf{Search}$ (STX-Search), a novel method for generating instance-level +explanations that is applicable to static and dynamic temporal graph +structures. We introduce a novel search strategy and objective function, to +find explanations that are highly faithful and interpretable. When compared +with existing methods, STX-Search produces explanations of higher fidelity +whilst optimising explanation size to maintain interpretability. + +
+
+
+
+
+ + ☆ Multi-modal Summarization in Model-Based Engineering: Automotive + Software Development Case Study + + +
+ Multimodal summarization integrating information from diverse data modalities +presents a promising solution to aid the understanding of information within +various processes. However, the application and advantages of multimodal +summarization have not received much attention in model-based engineering +(MBE), where it has become a cornerstone in the design and development of +complex systems, leveraging formal models to improve understanding, validation +and automation throughout the engineering lifecycle. UML and EMF diagrams in +model-based engineering contain a large amount of multimodal information and +intricate relational data. Hence, our study explores the application of +multimodal large language models within the domain of model-based engineering +to evaluate their capacity for understanding and identifying relationships, +features, and functionalities embedded in UML and EMF diagrams. We aim to +demonstrate the transformative potential benefits and limitations of multimodal +summarization in improving productivity and accuracy in MBE practices. The +proposed approach is evaluated within the context of automotive software +development, while many promising state-of-art models were taken into account. + +
+
+ comment: Conference paper accepted for IntelliSys2025 +
+
+
+
+
+ + ☆ Interpretable Transformation and Analysis of Timelines through Learning + via Surprisability + + +
+ The analysis of high-dimensional timeline data and the identification of +outliers and anomalies is critical across diverse domains, including sensor +readings, biological and medical data, historical records, and global +statistics. However, conventional analysis techniques often struggle with +challenges such as high dimensionality, complex distributions, and sparsity. +These limitations hinder the ability to extract meaningful insights from +complex temporal datasets, making it difficult to identify trending features, +outliers, and anomalies effectively. Inspired by surprisability -- a cognitive +science concept describing how humans instinctively focus on unexpected +deviations -- we propose Learning via Surprisability (LvS), a novel approach for +transforming high-dimensional timeline data. LvS quantifies and prioritizes +anomalies in time-series data by formalizing deviations from expected behavior. +LvS bridges cognitive theories of attention with computational methods, +enabling the detection of anomalies and shifts in a way that preserves critical +context, offering a new lens for interpreting complex datasets. We demonstrate +the usefulness of LvS on three high-dimensional timeline use cases: a time +series of sensor data, a global dataset of mortality causes over multiple +years, and a textual corpus containing over two centuries of State of the Union +Addresses by U.S. presidents. Our results show that the LvS transformation +enables efficient and interpretable identification of outliers, anomalies, and +the most variable features along the timeline. + +
+
+
+
+
+ + ☆ ReynoldsFlow: Exquisite Flow Estimation via Reynolds Transport Theorem + + +
+ Optical flow is a fundamental technique for motion estimation, widely applied +in video stabilization, interpolation, and object tracking. Recent advancements +in artificial intelligence (AI) have enabled deep learning models to leverage +optical flow as an important feature for motion analysis. However, traditional +optical flow methods rely on restrictive assumptions, such as brightness +constancy and slow motion constraints, limiting their effectiveness in complex +scenes. Deep learning-based approaches require extensive training on large +domain-specific datasets, making them computationally demanding. Furthermore, +optical flow is typically visualized in the HSV color space, which introduces +nonlinear distortions when converted to RGB and is highly sensitive to noise, +degrading motion representation accuracy. These limitations inherently +constrain the performance of downstream models, potentially hindering object +tracking and motion analysis tasks. To address these challenges, we propose +Reynolds flow, a novel training-free flow estimation inspired by the Reynolds +transport theorem, offering a principled approach to modeling complex motion +dynamics. Beyond the conventional HSV-based visualization, denoted +ReynoldsFlow, we introduce an alternative representation, ReynoldsFlow+, +designed to improve flow visualization. We evaluate ReynoldsFlow and +ReynoldsFlow+ across three video-based benchmarks: tiny object detection on +UAVDB, infrared object detection on Anti-UAV, and pose estimation on GolfDB. +Experimental results demonstrate that networks trained with ReynoldsFlow+ +achieve state-of-the-art (SOTA) performance, exhibiting improved robustness and +efficiency across all tasks. + +
+
+ comment: 10 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Generalized Interpolating Discrete Diffusion + + +
+ While state-of-the-art language models achieve impressive results through +next-token prediction, they have inherent limitations such as the inability to +revise already generated tokens. This has prompted exploration of alternative +approaches such as discrete diffusion. However, masked diffusion, which has +emerged as a popular choice due to its simplicity and effectiveness, +reintroduces this inability to revise words. To overcome this, we generalize +masked diffusion and derive the theoretical backbone of a family of general +interpolating discrete diffusion (GIDD) processes offering greater flexibility +in the design of the noising processes. Leveraging a novel diffusion ELBO, we +achieve compute-matched state-of-the-art performance in diffusion language +modeling. Exploiting GIDD's flexibility, we explore a hybrid approach combining +masking and uniform noise, leading to improved sample quality and unlocking the +ability for the model to correct its own mistakes, an area where autoregressive +models notoriously have struggled. Our code and models are open-source: +https://github.com/dvruette/gidd/ + +
+
+
+
+
+ + ☆ ToolFuzz -- Automated Agent Tool Testing + + +
+ Large Language Model (LLM) Agents leverage the advanced reasoning +capabilities of LLMs in real-world applications. To interface with an +environment, these agents often rely on tools, such as web search or database +APIs. As the agent provides the LLM with tool documentation along the user +query, the completeness and correctness of this documentation is critical. +However, tool documentation is often over-, under-, or ill-specified, impeding +the agent's accuracy. Standard software testing approaches struggle to identify +these errors as they are expressed in natural language. Thus, despite its +importance, there currently exists no automated method to test the tool +documentation for agents. To address this issue, we present ToolFuzz, the first +method for automated testing of tool documentations. ToolFuzz is designed to +discover two types of errors: (1) user queries leading to tool runtime errors +and (2) user queries that lead to incorrect agent responses. ToolFuzz can +generate a large and diverse set of natural inputs, effectively finding tool +description errors at a low false positive rate. Further, we present two +straightforward prompt-engineering approaches. We evaluate all three tool +testing approaches on 32 common LangChain tools and 35 newly created custom +tools and 2 novel benchmarks to further strengthen the assessment. We find that +many publicly available tools suffer from underspecification. Specifically, we +show that ToolFuzz identifies 20x more erroneous inputs compared to the +prompt-engineering approaches, making it a key component for building reliable +AI agents. + +
+
+
+
+
+ + ☆ DAST: Difficulty-Adaptive Slow-Thinking for Large Reasoning Models + + +
+ Recent advancements in slow-thinking reasoning models have shown exceptional +performance in complex reasoning tasks. However, these models often exhibit +overthinking--generating redundant reasoning steps for simple problems, leading +to excessive computational resource usage. While current mitigation strategies +uniformly reduce reasoning tokens, they risk degrading performance on +challenging tasks that require extended reasoning. This paper introduces +Difficulty-Adaptive Slow-Thinking (DAST), a novel framework that enables models +to autonomously adjust the length of Chain-of-Thought (CoT) based on problem +difficulty. We first propose a Token Length Budget (TLB) metric to quantify +difficulty, then leverage length-aware reward shaping and length preference +optimization to implement DAST. DAST penalizes overlong responses for simple +tasks while incentivizing sufficient reasoning for complex problems. +Experiments on diverse datasets and model scales demonstrate that DAST +effectively mitigates overthinking (reducing token usage by over 30\% on +average) while preserving reasoning accuracy on complex problems. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ TPC: Cross-Temporal Prediction Connection for Vision-Language Model + Hallucination Reduction + + +
+ Vision-language models (VLMs) have achieved remarkable advancements, +capitalizing on the impressive capabilities of large language models (LLMs) +across diverse tasks. Despite this, a critical challenge known as hallucination +occurs when models overconfidently describe objects or attributes absent from +the image, a problem exacerbated by the tendency of VLMs to rely on linguistic +priors. This limitation reduces model reliability in high-stakes applications. +In this work, we have observed the characteristic of logits' continuity +consistency enhancement and introduced a straightforward and efficient method, +Cross-Temporal Prediction Connection (TPC), designed to enhance the semantic +consistency of logits by connecting them temporally across timesteps. TPC +amplifies information flow and improves coherence, effectively reducing +hallucination. Extensive experiments show that TPC surpasses existing +representatives, delivering superior performance in both accuracy and +efficiency while maintaining robustness in open-ended text generation tasks. + +
+
+
+
+
+ + ☆ Privacy Preserving and Robust Aggregation for Cross-Silo Federated + Learning in Non-IID Settings + + +
+ Federated Averaging remains the most widely used aggregation strategy in +federated learning due to its simplicity and scalability. However, its +performance degrades significantly in non-IID data settings, where client +distributions are highly imbalanced or skewed. Additionally, it relies on +clients transmitting metadata, specifically the number of training samples, +which introduces privacy risks and may conflict with regulatory frameworks like +the European GDPR. In this paper, we propose a novel aggregation strategy that +addresses these challenges by introducing class-aware gradient masking. Unlike +traditional approaches, our method relies solely on gradient updates, +eliminating the need for any additional client metadata, thereby enhancing +privacy protection. Furthermore, our approach validates and dynamically weights +client contributions based on class-specific importance, ensuring robustness +against non-IID distributions, convergence prevention, and backdoor attacks. +Extensive experiments on benchmark datasets demonstrate that our method not +only outperforms FedAvg and other widely accepted aggregation strategies in +non-IID settings but also preserves model integrity in adversarial scenarios. +Our results establish the effectiveness of gradient masking as a practical and +secure solution for federated learning. + +
+
+
+
+
+ + ☆ Activation Space Interventions Can Be Transferred Between Large Language + Models + + +
+ The study of representation universality in AI models reveals growing +convergence across domains, modalities, and architectures. However, the +practical applications of representation universality remain largely +unexplored. We bridge this gap by demonstrating that safety interventions can +be transferred between models through learned mappings of their shared +activation spaces. We demonstrate this approach on two well-established AI +safety tasks: backdoor removal and refusal of harmful prompts, showing +successful transfer of steering vectors that alter the models' outputs in a +predictable way. Additionally, we propose a new task, \textit{corrupted +capabilities}, where models are fine-tuned to embed knowledge tied to a +backdoor. This tests their ability to separate useful skills from backdoors, +reflecting real-world challenges. Extensive experiments across Llama, Qwen and +Gemma model families show that our method enables using smaller models to +efficiently align larger ones. Furthermore, we demonstrate that autoencoder +mappings between base and fine-tuned models can serve as reliable ``lightweight +safety switches'', allowing dynamic toggling between model behaviors. + +
+
+ comment: 68 pages +
+
+
+
+
+ + ☆ PDX: A Data Layout for Vector Similarity Search SIGMOD '25 + + +
+ We propose Partition Dimensions Across (PDX), a data layout for vectors +(e.g., embeddings) that, similar to PAX [6], stores multiple vectors in one +block, using a vertical layout for the dimensions (Figure 1). PDX accelerates +exact and approximate similarity search thanks to its dimension-by-dimension +search strategy that operates on multiple-vectors-at-a-time in tight loops. It +beats SIMD-optimized distance kernels on standard horizontal vector storage +(avg 40% faster), only relying on scalar code that gets auto-vectorized. We +combined the PDX layout with recent dimension-pruning algorithms ADSampling +[19] and BSA [52] that accelerate approximate vector search. We found that +these algorithms on the horizontal vector layout can lose to SIMD-optimized +linear scans, even if they are SIMD-optimized. However, when used on PDX, their +benefit is restored to 2-7x. We find that search on PDX is especially fast if a +limited number of dimensions has to be scanned fully, which is what the +dimension-pruning approaches do. We finally introduce PDX-BOND, an even more +flexible dimension-pruning strategy, with good performance on exact search and +reasonable performance on approximate search. Unlike previous pruning +algorithms, it can work on vector data "as-is" without preprocessing; making it +attractive for vector databases with frequent updates. + +
+
+ comment: To be published in Proceedings of The 2025 International Conference + on Management of Data (SIGMOD '25). For associated code, see + https://github.com/cwida/PDX +
+
+
+
+
+ + ☆ From Idea to CAD: A Language Model-Driven Multi-Agent System for + Collaborative Design + + +
+ Creating digital models using Computer Aided Design (CAD) is a process that +requires in-depth expertise. In industrial product development, this process +typically involves entire teams of engineers, spanning requirements +engineering, CAD itself, and quality assurance. We present an approach that +mirrors this team structure with a Vision Language Model (VLM)-based Multi +Agent System, with access to parametric CAD tooling and tool documentation. +Combining agents for requirements engineering, CAD engineering, and +vision-based quality assurance, a model is generated automatically from +sketches and/or textual descriptions. The resulting model can be refined +collaboratively in an iterative validation loop with the user. Our approach has +the potential to increase the effectiveness of design processes, both for +industry experts and for hobbyists who create models for 3D printing. We +demonstrate the potential of the architecture at the example of various design +tasks and provide several ablations that show the benefits of the +architecture's individual components. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Learning Transformer-based World Models with Contrastive Predictive + Coding + + +
+ The DreamerV3 algorithm recently obtained remarkable performance across +diverse environment domains by learning an accurate world model based on +Recurrent Neural Networks (RNNs). Following the success of model-based +reinforcement learning algorithms and the rapid adoption of the Transformer +architecture for its superior training efficiency and favorable scaling +properties, recent works such as STORM have proposed replacing RNN-based world +models with Transformer-based world models using masked self-attention. +However, despite the improved training efficiency of these methods, their +impact on performance remains limited compared to the Dreamer algorithm, +struggling to learn competitive Transformer-based world models. In this work, +we show that the next state prediction objective adopted in previous approaches +is insufficient to fully exploit the representation capabilities of +Transformers. We propose to extend world model predictions to longer time +horizons by introducing TWISTER (Transformer-based World model wIth contraSTivE +Representations), a world model using action-conditioned Contrastive Predictive +Coding to learn high-level temporal feature representations and improve the +agent performance. TWISTER achieves a human-normalized mean score of 162% on +the Atari 100k benchmark, setting a new record among state-of-the-art methods +that do not employ look-ahead search. + +
+
+
+
+
+ + ☆ Wider or Deeper? Scaling LLM Inference-Time Compute with Adaptive + Branching Tree Search ICLR 2025 + + +
+ Recent advances demonstrate that increasing inference-time computation can +significantly boost the reasoning capabilities of large language models (LLMs). +Although repeated sampling (i.e., generating multiple candidate outputs) is a +highly effective strategy, it does not leverage external feedback signals for +refinement, which are often available in tasks like coding. In this work, we +propose $\textit{Adaptive Branching Monte Carlo Tree Search (AB-MCTS)}$, a +novel inference-time framework that generalizes repeated sampling with +principled multi-turn exploration and exploitation. At each node in the search +tree, AB-MCTS dynamically decides whether to "go wider" by expanding new +candidate responses or "go deeper" by revisiting existing ones based on +external feedback signals. We evaluate our method on complex coding and +engineering tasks using frontier models. Empirical results show that AB-MCTS +consistently outperforms both repeated sampling and standard MCTS, underscoring +the importance of combining the response diversity of LLMs with multi-turn +solution refinement for effective inference-time scaling. + +
+
+ comment: To appear at ICLR 2025 Workshop on Foundation Models in the Wild +
+
+
+
+
+ + ☆ Training-Free Graph Filtering via Multimodal Feature Refinement for + Extremely Fast Multimodal Recommendation + + +
+ Multimodal recommender systems improve the performance of canonical +recommender systems with no item features by utilizing diverse content types +such as text, images, and videos, while alleviating inherent sparsity of +user-item interactions and accelerating user engagement. However, current +neural network-based models often incur significant computational overhead due +to the complex training process required to learn and integrate information +from multiple modalities. To overcome this limitation, we propose +MultiModal-Graph Filtering (MM-GF), a training-free method based on the notion +of graph filtering (GF) for efficient and accurate multimodal recommendations. +Specifically, MM-GF first constructs multiple similarity graphs through +nontrivial multimodal feature refinement such as robust scaling and vector +shifting by addressing the heterogeneous characteristics across modalities. +Then, MM-GF optimally fuses multimodal information using linear low-pass +filters across different modalities. Extensive experiments on real-world +benchmark datasets demonstrate that MM-GF not only improves recommendation +accuracy by up to 13.35% compared to the best competitor but also dramatically +reduces computational costs by achieving the runtime of less than 10 seconds. + +
+
+ comment: 10 pages, 6 figures, 6 tables +
+
+
+
+
+ + ☆ Speculative MoE: Communication Efficient Parallel MoE Inference with + Speculative Token and Expert Pre-scheduling + + +
+ MoE (Mixture of Experts) prevails as a neural architecture that can scale +modern transformer-based LLMs (Large Language Models) to unprecedented scales. +Nevertheless, large MoEs' great demands of computing power, memory capacity and +memory bandwidth make scalable serving a fundamental challenge and efficient +parallel inference has become a requisite to attain adequate throughput under +latency constraints. DeepSpeed-MoE, one state-of-the-art MoE inference +framework, adopts a 3D-parallel paradigm including EP (Expert Parallelism), TP +(Tensor Parallel) and DP (Data Parallelism). However, our analysis shows +DeepSpeed-MoE's inference efficiency is largely bottlenecked by EP, which is +implemented with costly all-to-all collectives to route token activation. Our +work aims to boost DeepSpeed-MoE by strategically reducing EP's communication +overhead with a technique named Speculative MoE. Speculative MoE has two +speculative parallelization schemes, speculative token shuffling and +speculative expert grouping, which predict outstanding tokens' expert routing +paths and pre-schedule tokens and experts across devices to losslessly trim +EP's communication volume. Besides DeepSpeed-MoE, we also build Speculative MoE +into a prevailing MoE inference engine SGLang. Experiments show Speculative MoE +can significantly boost state-of-the-art MoE inference frameworks on fast +homogeneous and slow heterogeneous interconnects. + +
+
+
+
+
+ + ☆ AgentSafe: Safeguarding Large Language Model-based Multi-agent Systems + via Hierarchical Data Management + + +
+ Large Language Model based multi-agent systems are revolutionizing autonomous +communication and collaboration, yet they remain vulnerable to security threats +like unauthorized access and data breaches. To address this, we introduce +AgentSafe, a novel framework that enhances MAS security through hierarchical +information management and memory protection. AgentSafe classifies information +by security levels, restricting sensitive data access to authorized agents. +AgentSafe incorporates two components: ThreatSieve, which secures communication +by verifying information authority and preventing impersonation, and +HierarCache, an adaptive memory management system that defends against +unauthorized access and malicious poisoning, representing the first systematic +defense for agent memory. Experiments across various LLMs show that AgentSafe +significantly boosts system resilience, achieving defense success rates above +80% under adversarial conditions. Additionally, AgentSafe demonstrates +scalability, maintaining robust performance as agent numbers and information +complexity grow. Results underscore effectiveness of AgentSafe in securing MAS +and its potential for real-world application. + +
+
+
+
+
+ + ☆ Dedicated Feedback and Edit Models Empower Inference-Time Scaling for + Open-Ended General-Domain Tasks + + +
+ Inference-Time Scaling has been critical to the success of recent models such +as OpenAI o1 and DeepSeek R1. However, many techniques used to train models for +inference-time scaling require tasks to have answers that can be verified, +limiting their application to domains such as math, coding and logical +reasoning. We take inspiration from how humans make first attempts, ask for +detailed feedback from others and make improvements based on such feedback +across a wide spectrum of open-ended endeavors. To this end, we collect data +for and train dedicated Feedback and Edit Models that are capable of performing +inference-time scaling for open-ended general-domain tasks. In our setup, one +model generates an initial response, which is given feedback by a second +model, which is then used by a third model to edit the response. We show that +performance on Arena Hard, a benchmark strongly predictive of Chatbot Arena Elo, +can be boosted by scaling the number of initial response drafts, effective +feedback and edited responses. When scaled optimally, our setup based on 70B +models from the Llama 3 family can reach SoTA performance on Arena Hard at 92.7 +as of 5 Mar 2025, surpassing OpenAI o1-preview-2024-09-12 with 90.4 and +DeepSeek R1 with 92.3. + +
+
+ comment: 22 pages, 2 figures +
+
+
+
+
+ + ☆ Causally Reliable Concept Bottleneck Models + + +
+ Concept-based models are an emerging paradigm in deep learning that +constrains the inference process to operate through human-interpretable +concepts, facilitating explainability and human interaction. However, these +architectures, on par with popular opaque neural models, fail to account for +the true causal mechanisms underlying the target phenomena represented in the +data. This hampers their ability to support causal reasoning tasks, limits +out-of-distribution generalization, and hinders the implementation of fairness +constraints. To overcome these issues, we propose Causally reliable +Concept Bottleneck Models (C$^2$BMs), a class of concept-based architectures +that enforce reasoning through a bottleneck of concepts structured according to +a model of the real-world causal mechanisms. We also introduce a pipeline to +automatically learn this structure from observational data and +unstructured background knowledge (e.g., scientific literature). +Experimental evidence suggests that C$^2$BMs are more interpretable, causally +reliable, and improve responsiveness to interventions w.r.t. standard opaque +and concept-based models, while maintaining their accuracy. + +
+
+
+
+
+ + ☆ A Generalist Cross-Domain Molecular Learning Framework for + Structure-Based Drug Discovery + + +
+ Structure-based drug discovery (SBDD) is a systematic scientific process that +develops new drugs by leveraging the detailed physical structure of the target +protein. Recent advancements in pre-trained models for biomolecules have +demonstrated remarkable success across various biochemical applications, +including drug discovery and protein engineering. However, in most approaches, +the pre-trained models primarily focus on the characteristics of either small +molecules or proteins, without delving into their binding interactions which +are essential cross-domain relationships pivotal to SBDD. To fill this gap, we +propose a general-purpose foundation model named BIT (an abbreviation for +Biomolecular Interaction Transformer), which is capable of encoding a range of +biochemical entities, including small molecules, proteins, and protein-ligand +complexes, as well as various data formats, encompassing both 2D and 3D +structures. Specifically, we introduce Mixture-of-Domain-Experts (MoDE) to +handle the biomolecules from diverse biochemical domains and +Mixture-of-Structure-Experts (MoSE) to capture positional dependencies in the +molecular structures. The proposed mixture-of-experts approach enables BIT to +achieve both deep fusion and domain-specific encoding, effectively capturing +fine-grained molecular interactions within protein-ligand complexes. Then, we +perform cross-domain pre-training on the shared Transformer backbone via +several unified self-supervised denoising tasks. Experimental results on +various benchmarks demonstrate that BIT achieves exceptional performance in +downstream tasks, including binding affinity prediction, structure-based +virtual screening, and molecular property prediction. + +
+
+
+
+
+ + scDD: Latent Codes Based scRNA-seq Dataset Distillation with Foundation + Model Knowledge + + +
+ Single-cell RNA sequencing (scRNA-seq) technology has profiled hundreds of +millions of human cells across organs, diseases, development and perturbations +to date. However, the high-dimensional sparsity, batch effect noise, category +imbalance, and ever-increasing data scale of the original sequencing data pose +significant challenges for multi-center knowledge transfer, data fusion, and +cross-validation between scRNA-seq datasets. To address these barriers, (1) we +first propose a latent codes-based scRNA-seq dataset distillation framework +named scDD, which transfers and distills foundation model knowledge and +original dataset information into a compact latent space and generates +a synthetic scRNA-seq dataset by a generator to replace the original dataset. +Then, (2) we propose a single-step conditional diffusion generator named SCDG, +which performs single-step gradient back-propagation to help scDD optimize +distillation quality and avoid gradient decay caused by multi-step +back-propagation. Meanwhile, SCDG ensures the scRNA-seq data characteristics +and inter-class discriminability of the synthetic dataset through flexible +conditional control and generation quality assurance. Finally, we propose a +comprehensive benchmark to evaluate the performance of scRNA-seq dataset +distillation in different data analysis tasks. It is validated that our +proposed method can achieve 7.61% absolute and 15.70% relative improvement over +previous state-of-the-art methods on average task. + +
+
+
+
+
+ + ☆ Talking Back -- human input and explanations to interactive AI systems + + +
+ While XAI focuses on providing AI explanations to humans, can the reverse - +humans explaining their judgments to AI - foster richer, synergistic human-AI +systems? This paper explores various forms of human inputs to AI and examines +how human explanations can guide machine learning models toward automated +judgments and explanations that align more closely with human concepts. + +
+
+
+
+
+ + ☆ Solving Word-Sense Disambiguation and Word-Sense Induction with + Dictionary Examples + + +
+ Many less-resourced languages struggle with a lack of large, task-specific +datasets that are required for solving relevant tasks with modern +transformer-based large language models (LLMs). On the other hand, many +linguistic resources, such as dictionaries, are rarely used in this context +despite their large information contents. We show how LLMs can be used to +extend existing language resources in less-resourced languages for two +important tasks: word-sense disambiguation (WSD) and word-sense induction +(WSI). We approach the two tasks through the related but much more accessible +word-in-context (WiC) task where, given a pair of sentences and a target word, +a classification model is tasked with predicting whether the sense of a given +word differs between sentences. We demonstrate that a well-trained model for +this task can distinguish between different word senses and can be adapted to +solve the WSD and WSI tasks. The advantage of using the WiC task, instead of +directly predicting senses, is that the WiC task does not need pre-constructed +sense inventories with a sufficient number of examples for each sense, which +are rarely available in less-resourced languages. We show that sentence pairs +for the WiC task can be successfully generated from dictionary examples using +LLMs. The resulting prediction models outperform existing models on WiC, WSD, +and WSI tasks. We demonstrate our methodology on the Slovene language, where a +monolingual dictionary is available, but word-sense resources are tiny. + +
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ☆ Provable Robust Overfitting Mitigation in Wasserstein Distributionally + Robust Optimization + + +
+ Wasserstein distributionally robust optimization (WDRO) optimizes against +worst-case distributional shifts within a specified uncertainty set, leading to +enhanced generalization on unseen adversarial examples, compared to standard +adversarial training which focuses on pointwise adversarial perturbations. +However, WDRO still suffers fundamentally from the robust overfitting problem, +as it does not consider statistical error. We address this gap by proposing a +novel robust optimization framework under a new uncertainty set for adversarial +noise via Wasserstein distance and statistical error via Kullback-Leibler +divergence, called the Statistically Robust WDRO. We establish a robust +generalization bound for the new optimization framework, implying that +out-of-distribution adversarial performance is at least as good as the +statistically robust training loss with high probability. Furthermore, we +derive conditions under which Stackelberg and Nash equilibria exist between the +learner and the adversary, giving an optimal robust model in a certain sense. +Finally, through extensive experiments, we demonstrate that our method +significantly mitigates robust overfitting and enhances robustness within the +framework of WDRO. + +
+
+
+
+
+ + ☆ Malware Detection at the Edge with Lightweight LLMs: A Performance + Evaluation + + +
+ The rapid evolution of malware attacks calls for the development of +innovative detection methods, especially in resource-constrained edge +computing. Traditional detection techniques struggle to keep up with modern +malware's sophistication and adaptability, prompting a shift towards advanced +methodologies like those leveraging Large Language Models (LLMs) for enhanced +malware detection. However, deploying LLMs for malware detection directly at +edge devices raises several challenges, including ensuring accuracy in +constrained environments and addressing edge devices' energy and computational +limits. To tackle these challenges, this paper proposes an architecture +leveraging lightweight LLMs' strengths while addressing limitations like +reduced accuracy and insufficient computational power. To evaluate the +effectiveness of the proposed lightweight LLM-based approach for edge +computing, we perform an extensive experimental evaluation using several +state-of-the-art lightweight LLMs. We test them with several publicly available +datasets specifically designed for edge and IoT scenarios and different edge +nodes with varying computational power and characteristics. + +
+
+
+
+
+ + ☆ Mapping AI Benchmark Data to Quantitative Risk Estimates Through Expert + Elicitation + + +
+ The literature and multiple experts point to many potential risks from large +language models (LLMs), but there are still very few direct measurements of the +actual harms posed. AI risk assessment has so far focused on measuring the +models' capabilities, but the capabilities of models are only indicators of +risk, not measures of risk. Better modeling and quantification of AI risk +scenarios can help bridge this disconnect and link the capabilities of LLMs to +tangible real-world harm. This paper makes an early contribution to this field +by demonstrating how existing AI benchmarks can be used to facilitate the +creation of risk estimates. We describe the results of a pilot study in which +experts use information from Cybench, an AI benchmark, to generate probability +estimates. We show that the methodology seems promising for this purpose, while +noting improvements that can be made to further strengthen its application in +quantitative AI risk assessment. + +
+
+ comment: 23 pages, 4 figures +
+
+
+
+
+ + ☆ MathMistake Checker: A Comprehensive Demonstration for Step-by-Step Math + Problem Mistake Finding by Prompt-Guided LLMs AAAI 2025 + + +
+ We propose a novel system, MathMistake Checker, designed to automate +step-by-step mistake finding in mathematical problems with lengthy answers +through a two-stage process. The system aims to simplify grading, increase +efficiency, and enhance learning experiences from a pedagogical perspective. It +integrates advanced technologies, including computer vision and the +chain-of-thought capabilities of the latest large language models (LLMs). Our +system supports open-ended grading without reference answers and promotes +personalized learning by providing targeted feedback. We demonstrate its +effectiveness across various types of math problems, such as calculation and +word problems. + +
+
+ comment: Published in AAAI 2025 +
+
+
+
+
+ + ☆ How Do Hackathons Foster Creativity? Towards AI Collaborative Evaluation + of Creativity at Scale + + +
+ Hackathons have become popular collaborative events for accelerating the +development of creative ideas and prototypes. There are several case studies +showcasing creative outcomes across domains such as industry, education, and +research. However, there are no large-scale studies on creativity in hackathons +which can advance theory on how hackathon formats lead to creative outcomes. We +conducted a computational analysis of 193,353 hackathon projects. By +operationalizing creativity through usefulness and novelty, we refined our +dataset to 10,363 projects, allowing us to analyze how participant +characteristics, collaboration patterns, and hackathon setups influence the +development of creative projects. The contribution of our paper is twofold: We +identified means for organizers to foster creativity in hackathons. We also +explore the use of large language models (LLMs) to augment the evaluation of +creative outcomes and discuss challenges and opportunities of doing this, which +has implications for creativity research at large. + +
+
+ comment: Accepted in Proceedings of the 2025 CHI Conference on Human Factors + in Computing Systems +
+
+
+
+
+ + ☆ Explainable AI in Time-Sensitive Scenarios: Prefetched Offline + Explanation Model + + +
+ As predictive machine learning models become increasingly adopted and +advanced, their role has evolved from merely predicting outcomes to actively +shaping them. This evolution has underscored the importance of Trustworthy AI, +highlighting the necessity to extend our focus beyond mere accuracy and toward +a comprehensive understanding of these models' behaviors within the specific +contexts of their applications. To further progress in explainability, we +introduce Poem, Prefetched Offline Explanation Model, a model-agnostic, local +explainability algorithm for image data. The algorithm generates exemplars, +counterexemplars and saliency maps to provide quick and effective explanations +suitable for time-sensitive scenarios. Leveraging an existing local algorithm, +Poem infers factual and counterfactual rules from data to create +illustrative examples and opposite scenarios with an enhanced stability by +design. A novel mechanism then matches incoming test points with an explanation +base and produces diverse exemplars, informative saliency maps and believable +counterexemplars. Experimental results indicate that Poem outperforms its +predecessor Abele in speed and ability to generate more nuanced and varied +exemplars alongside more insightful saliency maps and valuable +counterexemplars. + +
+
+
+
+
+ + ☆ Towards Autonomous Reinforcement Learning for Real-World Robotic + Manipulation with Large Language Models + + +
+ Recent advancements in Large Language Models (LLMs) and Visual Language +Models (VLMs) have significantly impacted robotics, enabling high-level +semantic motion planning applications. Reinforcement Learning (RL), a +complementary paradigm, enables agents to autonomously optimize complex +behaviors through interaction and reward signals. However, designing effective +reward functions for RL remains challenging, especially in real-world tasks +where sparse rewards are insufficient and dense rewards require elaborate +design. In this work, we propose Autonomous Reinforcement learning for Complex +HumanInformed Environments (ARCHIE), an unsupervised pipeline leveraging GPT-4, +a pre-trained LLM, to generate reward functions directly from natural language +task descriptions. The rewards are used to train RL agents in simulated +environments, where we formalize the reward generation process to enhance +feasibility. Additionally, GPT-4 automates the coding of task success criteria, +creating a fully automated, one-shot procedure for translating human-readable +text into deployable robot skills. Our approach is validated through extensive +simulated experiments on single-arm and bi-manual manipulation tasks using an +ABB YuMi collaborative robot, highlighting its practicality and effectiveness. +Tasks are demonstrated on the real robot setup. + +
+
+
+
+
+ + ☆ Prompt Programming: A Platform for Dialogue-based Computational Problem + Solving with Generative AI Models + + +
+ Computing students increasingly rely on generative AI tools for programming +assistance, often without formal instruction or guidance. This highlights a +need to teach students how to effectively interact with AI models, particularly +through natural language prompts, to generate and critically evaluate code for +solving computational tasks. To address this, we developed a novel platform for +prompt programming that enables authentic dialogue-based interactions, supports +problems involving multiple interdependent functions, and offers on-request +execution of generated code. Data analysis from over 900 students in an +introductory programming course revealed high engagement, with the majority of +prompts occurring within multi-turn dialogues. Problems with multiple +interdependent functions encouraged iterative refinement, with progression +graphs highlighting several common strategies. Students were highly selective +about the code they chose to test, suggesting that on-request execution of +generated code promoted critical thinking. Given the growing importance of +learning dialogue-based programming with AI, we provide this tool as a publicly +accessible resource, accompanied by a corpus of programming problems for +educational use. + +
+
+ comment: Preprint of the ITiCSE'25 paper +
+
+
+
+
+ + ☆ Guidelines for Applying RL and MARL in Cybersecurity Applications + + +
+ Reinforcement Learning (RL) and Multi-Agent Reinforcement Learning (MARL) +have emerged as promising methodologies for addressing challenges in automated +cyber defence (ACD). These techniques offer adaptive decision-making +capabilities in high-dimensional, adversarial environments. This report +provides a structured set of guidelines for cybersecurity professionals and +researchers to assess the suitability of RL and MARL for specific use cases, +considering factors such as explainability, exploration needs, and the +complexity of multi-agent coordination. It also discusses key algorithmic +approaches, implementation challenges, and real-world constraints, such as data +scarcity and adversarial interference. The report further outlines open +research questions, including policy optimality, agent cooperation levels, and +the integration of MARL systems into operational cybersecurity frameworks. By +bridging theoretical advancements and practical deployment, these guidelines +aim to enhance the effectiveness of AI-driven cyber defence strategies. + +
+
+
+
+
+ + ☆ VirtualXAI: A User-Centric Framework for Explainability Assessment + Leveraging GPT-Generated Personas + + +
+ In today's data-driven era, computational systems generate vast amounts of +data that drive the digital transformation of industries, where Artificial +Intelligence (AI) plays a key role. Currently, the demand for eXplainable AI +(XAI) has increased to enhance the interpretability, transparency, and +trustworthiness of AI models. However, evaluating XAI methods remains +challenging: existing evaluation frameworks typically focus on quantitative +properties such as fidelity, consistency, and stability without taking into +account qualitative characteristics such as satisfaction and interpretability. +In addition, practitioners face a lack of guidance in selecting appropriate +datasets, AI models, and XAI methods, a major hurdle in human-AI collaboration. +To address these gaps, we propose a framework that integrates quantitative +benchmarking with qualitative user assessments through virtual personas based +on the "Anthology" of backstories of the Large Language Model (LLM). Our +framework also incorporates a content-based recommender system that leverages +dataset-specific characteristics to match new input data with a repository of +benchmarked datasets. This yields an estimated XAI score and provides tailored +recommendations for both the optimal AI model and the XAI method for a given +scenario. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ TAIL: Text-Audio Incremental Learning + + +
+ Many studies combine text and audio to capture multi-modal information but +they overlook the model's generalization ability on new datasets. Introducing +new datasets may affect the feature space of the original dataset, leading to +catastrophic forgetting. Meanwhile, large model parameters can significantly +impact training performance. To address these limitations, we introduce a novel +task called Text-Audio Incremental Learning (TAIL) for text-audio +retrieval, and propose a new method, PTAT, Prompt Tuning for Audio-Text +incremental learning. This method utilizes prompt tuning to optimize the model +parameters while incorporating an audio-text similarity and feature +distillation module to effectively mitigate catastrophic forgetting. We +benchmark our method and previous incremental learning methods on AudioCaps, +Clotho, BBC Sound Effects and Audioset datasets, and our method outperforms +previous methods significantly, particularly demonstrating stronger resistance +to forgetting on older datasets. Compared to the full-parameters Finetune +(Sequential) method, our model only requires 2.42% of its parameters, +achieving 4.46% higher performance. + +
+
+ comment: 4 figures, 5 tables +
+
+
+
+
+ + ☆ How to Move Your Dragon: Text-to-Motion Synthesis for Large-Vocabulary + Objects + + +
+ Motion synthesis for diverse object categories holds great potential for 3D +content creation but remains underexplored due to two key challenges: (1) the +lack of comprehensive motion datasets that include a wide range of high-quality +motions and annotations, and (2) the absence of methods capable of handling +heterogeneous skeletal templates from diverse objects. To address these +challenges, we contribute the following: First, we augment the Truebones Zoo +dataset, a high-quality animal motion dataset covering over 70 species, by +annotating it with detailed text descriptions, making it suitable for +text-based motion synthesis. Second, we introduce rig augmentation techniques +that generate diverse motion data while preserving consistent dynamics, +enabling models to adapt to various skeletal configurations. Finally, we +redesign existing motion diffusion models to dynamically adapt to arbitrary +skeletal templates, enabling motion synthesis for a diverse range of objects +with varying structures. Experiments show that our method learns to generate +high-fidelity motions from textual descriptions for diverse and even unseen +objects, setting a strong foundation for motion synthesis across diverse object +categories and skeletal templates. Qualitative results are available on this +link: t2m4lvo.github.io + +
+
+
+
+
+ + ☆ Knowledge Retention for Continual Model-Based Reinforcement Learning + + +
+ We propose DRAGO, a novel approach for continual model-based reinforcement +learning aimed at improving the incremental development of world models across +a sequence of tasks that differ in their reward functions but not the state +space or dynamics. DRAGO comprises two key components: Synthetic Experience +Rehearsal, which leverages generative models to create synthetic experiences +from past tasks, allowing the agent to reinforce previously learned dynamics +without storing data, and Regaining Memories Through Exploration, which +introduces an intrinsic reward mechanism to guide the agent toward revisiting +relevant states from prior tasks. Together, these components enable the agent +to maintain a comprehensive and continually developing world model, +facilitating more effective learning and adaptation across diverse +environments. Empirical evaluations demonstrate that DRAGO is able to preserve +knowledge across tasks, achieving superior performance in various continual +learning scenarios. + +
+
+
+
+
+ + ☆ How to Mitigate Overfitting in Weak-to-strong Generalization? + + +
+ Aligning powerful AI models on tasks that surpass human evaluation +capabilities is the central problem of superalignment. To address this +problem, weak-to-strong generalization aims to elicit the capabilities of +strong models through weak supervisors and ensure that the behavior of strong +models aligns with the intentions of weak supervisors without unsafe behaviors +such as deception. Although weak-to-strong generalization exhibits certain +generalization capabilities, strong models exhibit significant overfitting in +weak-to-strong generalization: Due to the strong fit ability of strong models, +erroneous labels from weak supervisors may lead to overfitting in strong +models. In addition, simply filtering out incorrect labels may lead to a +degeneration in question quality, resulting in a weak generalization ability of +strong models on hard questions. To mitigate overfitting in weak-to-strong +generalization, we propose a two-stage framework that simultaneously improves +the quality of supervision signals and the quality of input questions. +Experimental results in three series of large language models and two +mathematical benchmarks demonstrate that our framework significantly improves +PGR compared to naive weak-to-strong generalization, even achieving up to 100% +PGR on some models. + +
+
+
+
+
+ + ☆ One-Shot Clustering for Federated Learning + + +
+ Federated Learning (FL) is a widespread and well-adopted paradigm of +decentralized learning that allows training one model from multiple sources +without the need to directly transfer data between participating clients. Since +its inception in 2015, it has been divided into numerous sub-fields that deal +with application-specific issues, be it data heterogeneity or resource +allocation. One such sub-field, Clustered Federated Learning (CFL), is dealing +with the problem of clustering the population of clients into separate cohorts +to deliver personalized models. Although a few remarkable works have been +published in this domain, the problem is still largely unexplored, as its basic +assumptions and settings are slightly different from standard FL. In this work, +we present One-Shot Clustered Federated Learning (OCFL), a clustering-agnostic +algorithm that can automatically detect the earliest suitable moment for +clustering. Our algorithm is based on the computation of cosine similarity +between gradients of the clients and a temperature measure that detects when +the federated model starts to converge. We empirically evaluate our methodology +by testing various one-shot clustering algorithms for over thirty different +tasks on three benchmark datasets. Our experiments showcase the good +performance of our approach when used to perform CFL in an automated manner +without the need to adjust hyperparameters. + +
+
+
+
+
+ + ☆ Quantum-Inspired Reinforcement Learning in the Presence of Epistemic + Ambivalence + + +
+ The complexity of online decision-making under uncertainty stems from the +requirement of finding a balance between exploiting known strategies and +exploring new possibilities. Naturally, the uncertainty type plays a crucial +role in developing decision-making strategies that manage complexity +effectively. In this paper, we focus on a specific form of uncertainty known as +epistemic ambivalence (EA), which emerges from conflicting pieces of evidence +or contradictory experiences. It creates a delicate interplay between +uncertainty and confidence, distinguishing it from epistemic uncertainty that +typically diminishes with new information. Indeed, ambivalence can persist even +after additional knowledge is acquired. To address this phenomenon, we propose +a novel framework, called the epistemically ambivalent Markov decision process +(EA-MDP), aiming to understand and control EA in decision-making processes. +This framework incorporates the concept of a quantum state from the quantum +mechanics formalism, and its core is to assess the probability and reward of +every possible outcome. We calculate the reward function using quantum +measurement techniques and prove the existence of an optimal policy and an +optimal value function in the EA-MDP framework. We also propose the +EA-epsilon-greedy Q-learning algorithm. To evaluate the impact of EA on +decision-making and the expedience of our framework, we study two distinct +experimental setups, namely the two-state problem and the lattice problem. Our +results show that using our methods, the agent converges to the optimal policy +in the presence of EA. + +
+
+
+
+
+ + ☆ Knowledge-Decoupled Synergetic Learning: An MLLM based Collaborative + Approach to Few-shot Multimodal Dialogue Intention Recognition + + +
+ Few-shot multimodal dialogue intention recognition is a critical challenge in +the e-commerce domain. Previous methods have primarily enhanced model +classification capabilities through post-training techniques. However, our +analysis reveals that training for few-shot multimodal dialogue intention +recognition involves two interconnected tasks, leading to a seesaw effect in +multi-task learning. This phenomenon is attributed to knowledge interference +stemming from the superposition of weight matrix updates during the training +process. To address these challenges, we propose Knowledge-Decoupled Synergetic +Learning (KDSL), which mitigates these issues by utilizing smaller models to +transform knowledge into interpretable rules, while applying the post-training +of larger models. By facilitating collaboration between the large and small +multimodal large language models for prediction, our approach demonstrates +significant improvements. Notably, we achieve outstanding results on two real +Taobao datasets, with enhancements of 6.37% and 6.28% in online weighted F1 +scores compared to the state-of-the-art method, thereby validating the efficacy +of our framework. + +
+
+
+
+
+ + ☆ MASTER: Multimodal Segmentation with Text Prompts + + +
+ RGB-Thermal fusion is a potential solution for various weather and light +conditions in challenging scenarios. However, plenty of studies focus on +designing complex modules to fuse different modalities. With the widespread +application of large language models (LLMs), valuable information can be more +effectively extracted from natural language. Therefore, we aim to leverage the +advantages of large language models to design a structurally simple and highly +adaptable multimodal fusion model architecture. We propose the MultimodAl +Segmentation with TExt PRompts (MASTER) architecture, which integrates LLM into +the fusion of RGB-Thermal multimodal data and allows complex query text to +participate in the fusion process. Our model utilizes a dual-path structure to +extract information from different modalities of images. Additionally, we +employ LLM as the core module for multimodal fusion, enabling the model to +generate learnable codebook tokens from RGB, thermal images, and textual +information. A lightweight image decoder is used to obtain semantic +segmentation results. The proposed MASTER performs exceptionally well in +benchmark tests across various automated driving scenarios, yielding promising +results. + +
+
+
+
+
+ + ☆ Large-Scale AI in Telecom: Charting the Roadmap for Innovation, + Scalability, and Enhanced Digital Experiences + + +
+ This white paper discusses the role of large-scale AI in the +telecommunications industry, with a specific focus on the potential of +generative AI to revolutionize network functions and user experiences, +especially in the context of 6G systems. It highlights the development and +deployment of Large Telecom Models (LTMs), which are tailored AI models +designed to address the complex challenges faced by modern telecom networks. +The paper covers a wide range of topics, from the architecture and deployment +strategies of LTMs to their applications in network management, resource +allocation, and optimization. It also explores the regulatory, ethical, and +standardization considerations for LTMs, offering insights into their future +integration into telecom infrastructure. The goal is to provide a comprehensive +roadmap for the adoption of LTMs to enhance scalability, performance, and +user-centric innovation in telecom networks. + +
+
+
+
+
+ + ☆ CrowdHMTware: A Cross-level Co-adaptation Middleware for Context-aware + Mobile DL Deployment + + +
+ There are many deep learning (DL) powered mobile and wearable applications +today continuously and unobtrusively sensing the ambient surroundings to +enhance all aspects of human lives. To enable robust and private mobile sensing, +DL models are often deployed locally on resource-constrained mobile devices +using techniques such as model compression or offloading. However, existing +methods, either front-end algorithm level (i.e. DL model +compression/partitioning) or back-end scheduling level (i.e. operator/resource +scheduling), cannot be locally online because they require offline retraining +to ensure accuracy or rely on manually pre-defined strategies, and thus struggle with +dynamic adaptability. The primary challenge lies in feeding back runtime +performance from the back-end level to the front-end level optimization +decision. Moreover, the adaptive mobile DL model porting middleware with +cross-level co-adaptation is less explored, particularly in mobile environments +with diversity and dynamics. In response, we introduce CrowdHMTware, a dynamic +context-adaptive DL model deployment middleware for heterogeneous mobile +devices. It establishes an automated adaptation loop between cross-level +functional components, i.e. elastic inference, scalable offloading, and +model-adaptive engine, enhancing scalability and adaptability. Experiments with +four typical tasks across 15 platforms and a real-world case study demonstrate +that CrowdHMTware can effectively scale DL model, offloading, and engine +actions across diverse platforms and tasks. It hides run-time system issues +from developers, reducing the required developer expertise. + +
+
+ comment: This paper is accepted by IEEE Transactions on Mobile Computing +
+
+
+
+
+ + ☆ TIMER: Temporal Instruction Modeling and Evaluation for Longitudinal + Clinical Records + + +
+ Large language models (LLMs) have emerged as promising tools for assisting in +medical tasks, yet processing Electronic Health Records (EHRs) presents unique +challenges due to their longitudinal nature. While LLMs' capabilities to +perform medical tasks continue to improve, their ability to reason over +temporal dependencies across multiple patient visits and time frames remains +unexplored. We introduce TIMER (Temporal Instruction Modeling and Evaluation +for Longitudinal Clinical Records), a framework that incorporates +instruction-response pairs grounding to different parts of a patient's record +as a critical dimension in both instruction evaluation and tuning for +longitudinal clinical records. We develop TIMER-Bench, the first time-aware +benchmark that evaluates temporal reasoning capabilities over longitudinal +EHRs, as well as TIMER-Instruct, an instruction-tuning methodology for LLMs to +learn reasoning over time. We demonstrate that models fine-tuned with +TIMER-Instruct improve performance by 7.3% on human-generated benchmarks and +9.2% on TIMER-Bench, indicating that temporal instruction-tuning improves model +performance for reasoning over EHR. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Towards Intelligent Transportation with Pedestrians and Vehicles + In-the-Loop: A Surveillance Video-Assisted Federated Digital Twin Framework + + +
+ In intelligent transportation systems (ITSs), incorporating pedestrians and +vehicles in-the-loop is crucial for developing realistic and safe traffic +management solutions. However, existing work falls short of simulating complex +real-world ITS scenarios, primarily due to the lack of a digital twin +implementation framework for characterizing interactions between pedestrians +and vehicles at different locations in different traffic environments. In this +article, we propose a surveillance video assisted federated digital twin +(SV-FDT) framework to empower ITSs with pedestrians and vehicles in-the-loop. +Specifically, SV-FDT builds comprehensive pedestrian-vehicle interaction models +by leveraging multi-source traffic surveillance videos. Its architecture +consists of three layers: (i) the end layer, which collects traffic +surveillance videos from multiple sources; (ii) the edge layer, responsible for +semantic segmentation-based visual understanding, twin agent-based interaction +modeling, and local digital twin system (LDTS) creation in local regions; and +(iii) the cloud layer, which integrates LDTSs across different regions to +construct a global DT model in real time. We analyze key design requirements and +challenges and present core guidelines for SV-FDT's system implementation. A +testbed evaluation demonstrates its effectiveness in optimizing traffic +management. Comparisons with traditional terminal-server frameworks highlight +SV-FDT's advantages in mirroring delays, recognition accuracy, and subjective +evaluation. Finally, we identify some open challenges and discuss future +research directions. + +
+
+
+
+
+ + ☆ The Role of Visual Modality in Multimodal Mathematical Reasoning: + Challenges and Insights + + +
+ Recent research has increasingly focused on multimodal mathematical +reasoning, particularly emphasizing the creation of relevant datasets and +benchmarks. Despite this, the role of visual information in reasoning has been +underexplored. Our findings show that existing multimodal mathematical models +minimally leverage visual information, and model performance remains largely +unaffected by changes to or removal of images in the dataset. We attribute this +to the dominance of textual information and answer options that inadvertently +guide the model to correct answers. To improve evaluation methods, we introduce +the HC-M3D dataset, specifically designed to require image reliance for +problem-solving and to challenge models with similar, yet distinct, images that +change the correct answer. In testing leading models, their failure to detect +these subtle visual differences suggests limitations in current visual +perception capabilities. Additionally, we observe that the common approach of +improving general VQA capabilities by combining various types of image encoders +does not contribute to math reasoning performance. This finding also presents a +challenge to enhancing visual reliance during math reasoning. Our benchmark and +code will be available at +https://github.com/Yufang-Liu/visual_modality_role. + +
+
+
+
+
+ + ☆ Semantic Retrieval Augmented Contrastive Learning for Sequential + Recommendation + + +
+ Sequential recommendation aims to model user preferences based on historical +behavior sequences, which is crucial for various online platforms. Data +sparsity remains a significant challenge in this area as most users have +limited interactions and many items receive little attention. To mitigate this +issue, contrastive learning has been widely adopted. By constructing positive +sample pairs from the data itself and maximizing their agreement in the +embedding space, it can leverage available data more effectively. Constructing +reasonable positive sample pairs is crucial for the success of contrastive +learning. However, current approaches struggle to generate reliable positive +pairs as they either rely on representations learned from inherently sparse +collaborative signals or use random perturbations which introduce significant +uncertainty. To address these limitations, we propose a novel approach named +Semantic Retrieval Augmented Contrastive Learning (SRA-CL), which leverages +semantic information to improve the reliability of contrastive samples. SRA-CL +comprises two main components: (1) Cross-Sequence Contrastive Learning via User +Semantic Retrieval, which utilizes large language models (LLMs) to understand +diverse user preferences and retrieve semantically similar users to form +reliable positive samples through a learnable sample synthesis method; and (2) +Intra-Sequence Contrastive Learning via Item Semantic Retrieval, which employs +LLMs to comprehend items and retrieve similar items to perform semantic-based +item substitution, thereby creating semantically consistent augmented views for +contrastive learning. SRA-CL is plug-and-play and can be integrated into +standard sequential recommendation models. Extensive experiments on four public +datasets demonstrate the effectiveness and generalizability of the proposed +approach. + +
+
+
+
+
+ + ☆ Unseen Fake News Detection Through Causal Debiasing + + +
+ The widespread dissemination of fake news on social media poses significant +risks, necessitating timely and accurate detection. However, existing methods +struggle with unseen news due to their reliance on training data from past +events and domains, leaving the challenge of detecting novel fake news largely +unresolved. To address this, we identify biases in training data tied to +specific domains and propose a debiasing solution FNDCD. Originating from +causal analysis, FNDCD employs a reweighting strategy based on classification +confidence and propagation structure regularization to reduce the influence of +domain-specific biases, enhancing the detection of unseen fake news. +Experiments on real-world datasets with non-overlapping news domains +demonstrate FNDCD's effectiveness in improving generalization across domains. + +
+
+ comment: 2025 The Web Conference, 6 pages, 4 figures +
+
+
+
+
+ + ☆ CA-W3D: Leveraging Context-Aware Knowledge for Weakly Supervised + Monocular 3D Detection + + +
+ Weakly supervised monocular 3D detection, while less annotation-intensive, +often struggles to capture the global context required for reliable 3D +reasoning. Conventional label-efficient methods focus on object-centric +features, neglecting contextual semantic relationships that are critical in +complex scenes. In this work, we propose a Context-Aware Weak Supervision for +Monocular 3D object detection, namely CA-W3D, to address this limitation in a +two-stage training paradigm. Specifically, we first introduce a pre-training +stage employing Region-wise Object Contrastive Matching (ROCM), which aligns +regional object embeddings derived from a trainable monocular 3D encoder and a +frozen open-vocabulary 2D visual grounding model. This alignment encourages the +monocular encoder to discriminate scene-specific attributes and acquire richer +contextual knowledge. In the second stage, we incorporate a pseudo-label +training process with a Dual-to-One Distillation (D2OD) mechanism, which +effectively transfers contextual priors into the monocular encoder while +preserving spatial fidelity and maintaining computational efficiency during +inference. Extensive experiments conducted on the public KITTI benchmark +demonstrate the effectiveness of our approach, surpassing the SoTA method over +all metrics, highlighting the importance of contextual-aware knowledge in +weakly-supervised monocular 3D detection. + +
+
+ comment: The paper includes 8 pages, 6 figures and 4 tables +
+
+
+
+
+ + ☆ KidneyTalk-open: No-code Deployment of a Private Large Language Model + with Medical Documentation-Enhanced Knowledge Database for Kidney Disease + + +
+ Privacy-preserving medical decision support for kidney disease requires +localized deployment of large language models (LLMs) while maintaining clinical +reasoning capabilities. Current solutions face three challenges: 1) Cloud-based +LLMs pose data security risks; 2) Local model deployment demands technical +expertise; 3) General LLMs lack mechanisms to integrate medical knowledge. +Retrieval-augmented systems also struggle with medical document processing and +clinical usability. We developed KidneyTalk-open, a desktop system integrating +three technical components: 1) No-code deployment of state-of-the-art (SOTA) +open-source LLMs (such as DeepSeek-r1, Qwen2.5) via local inference engine; 2) +Medical document processing pipeline combining context-aware chunking and +intelligent filtering; 3) Adaptive Retrieval and Augmentation Pipeline (AddRep) +employing agent collaboration for improving the recall rate of medical +documents. A graphical interface was designed to enable clinicians to manage +medical documents and conduct AI-powered consultations without technical +expertise. Experimental validation on 1,455 challenging nephrology exam +questions demonstrates AddRep's effectiveness: achieving 29.1% accuracy (+8.1% +over baseline) with intelligent knowledge integration, while maintaining +robustness through 4.9% rejection rate to suppress hallucinations. Comparative +case studies with the mainstream products (AnythingLLM, Chatbox, GPT4ALL) +demonstrate KidneyTalk-open's superior performance on real clinical queries. +KidneyTalk-open represents the first no-code medical LLM system enabling secure +documentation-enhanced medical Q&A on desktop. Its design establishes a new +framework for privacy-sensitive clinical AI applications. The system +significantly lowers technical barriers while improving evidence traceability, +enabling more medical staff or patients to use SOTA open-source LLMs +conveniently. + +
+
+ comment: Corresponding authors: zhanglx@bjmu.edu.cn; joy_yuxi@pku.edu.cn; + hongshenda@pku.edu.cn +
+
+
+
+
+ + ☆ Robust Multi-View Learning via Representation Fusion of Sample-Level + Attention and Alignment of Simulated Perturbation + + +
+ Recently, multi-view learning (MVL) has garnered significant attention due to +its ability to fuse discriminative information from multiple views. However, +real-world multi-view datasets are often heterogeneous and imperfect, which +usually makes MVL methods designed for specific combinations of views lack +application potential and limits their effectiveness. To address this issue, we +propose a novel robust MVL method (namely RML) with simultaneous representation +fusion and alignment. Specifically, we introduce a simple yet effective +multi-view transformer fusion network where we transform heterogeneous +multi-view data into homogeneous word embeddings, and then integrate multiple +views by the sample-level attention mechanism to obtain a fused representation. +Furthermore, we propose a simulated perturbation based multi-view contrastive +learning framework that dynamically generates the noise and unusable +perturbations for simulating imperfect data conditions. The simulated noisy and +unusable data obtain two distinct fused representations, and we utilize +contrastive learning to align them for learning discriminative and robust +representations. Our RML is self-supervised and can also be applied for +downstream tasks as a regularization. In experiments, we employ it in +unsupervised multi-view clustering, noise-label classification, and as a +plug-and-play module for cross-modal hashing retrieval. Extensive comparison +experiments and ablation studies validate the effectiveness of RML. + +
+
+
+
+
+ + ☆ Ticktack : Long Span Temporal Alignment of Large Language Models + Leveraging Sexagenary Cycle Time Expression + + +
+ Large language models (LLMs) suffer from temporal misalignment issues +especially across long spans of time. The issue arises from the fact that LLMs +are trained on large amounts of data where temporal information is rather +sparse over long times, such as thousands of years, resulting in insufficient +learning or catastrophic forgetting by the LLMs. This paper proposes a +methodology named "Ticktack" for addressing the LLM's long-time span +misalignment in a yearly setting. Specifically, we first propose to utilize the +sexagenary year expression instead of the Gregorian year expression employed by +LLMs, achieving a more uniform distribution in yearly granularity. Then, we +employ polar coordinates to model the sexagenary cycle of 60 terms and the year +order within each term, with additional temporal encoding to ensure LLMs +understand them. Finally, we present a temporal representational alignment +approach for post-training LLMs that effectively distinguishes time points with +relevant knowledge, hence improving performance on time-related tasks, +particularly over a long period. We also create a long time span benchmark for +evaluation. Experimental results prove the effectiveness of our proposal. + +
+
+
+
+
+ + ☆ Dynamic Benchmarking of Reasoning Capabilities in Code Large Language + Models Under Data Contamination + + +
+ The rapid evolution of code large language models underscores the need for +effective and transparent benchmarking of their reasoning capabilities. +However, the current benchmarking approach heavily depends on publicly +available, human-created datasets. The widespread use of these fixed benchmark +datasets makes the benchmarking process static and thus particularly +susceptible to data contamination, an unavoidable consequence of the extensive +data collection processes used to train Code LLMs. Existing approaches that +address data contamination often suffer from human effort limitations and +imbalanced problem complexity. To tackle these challenges, we propose DyCodeEval, a +novel benchmarking suite for evaluating Code LLMs under potential data +contamination. Given a seed programming problem, DyCodeEval employs multiple agents +to extract and modify the context without altering the core logic, generating +semantically equivalent variations. We introduce a dynamic data generation +method and conduct empirical studies on two seed datasets across 21 Code LLMs. +Results show that DyCodeEval effectively benchmarks reasoning capabilities under +contamination risks while generating diverse problem sets to ensure consistent +and reliable evaluations. + +
+
+ comment: https://codekaleidoscope.github.io/dycodeeval.html +
+
+
+
+
+ + ☆ DM-Adapter: Domain-Aware Mixture-of-Adapters for Text-Based Person + Retrieval AAAI 2025 + + +
+ Text-based person retrieval (TPR) has gained significant attention as a +fine-grained and challenging task that closely aligns with practical +applications. Tailoring CLIP to person domain is now an emerging research topic +due to the abundant knowledge of vision-language pretraining, but challenges +still remain during fine-tuning: (i) Previous full-model fine-tuning in TPR is +computationally expensive and prone to overfitting. (ii) Existing +parameter-efficient transfer learning (PETL) for TPR lacks fine-grained +feature extraction. To address these issues, we propose Domain-Aware +Mixture-of-Adapters (DM-Adapter), which unifies Mixture-of-Experts (MOE) and +PETL to enhance fine-grained feature representations while maintaining +efficiency. Specifically, Sparse Mixture-of-Adapters is designed in parallel to +MLP layers in both vision and language branches, where different experts +specialize in distinct aspects of person knowledge to handle features more +finely. To promote the router to exploit domain information effectively and +alleviate the routing imbalance, Domain-Aware Router is then developed by +building a novel gating function and injecting learnable domain-aware prompts. +Extensive experiments show that our DM-Adapter achieves state-of-the-art +performance, outperforming previous methods by a significant margin. + +
+
+ comment: 9 pages, 5 figures, accepted by AAAI 2025 +
+
+
+
+
+ + ☆ MTS: A Deep Reinforcement Learning Portfolio Management Framework with + Time-Awareness and Short-Selling + + +
+ Portfolio management remains a crucial challenge in finance, with traditional +methods often falling short in complex and volatile market environments. While +deep reinforcement approaches have shown promise, they still face limitations +in dynamic risk management, exploitation of temporal markets, and incorporation +of complex trading strategies such as short-selling. These limitations can lead +to suboptimal portfolio performance, increased vulnerability to market +volatility, and missed opportunities in capturing potential returns from +diverse market conditions. This paper introduces a Deep Reinforcement Learning +Portfolio Management Framework with Time-Awareness and Short-Selling (MTS), +offering a robust and adaptive strategy for sustainable investment performance. +This framework utilizes a novel encoder-attention mechanism to address the +limitations by incorporating temporal market characteristics, a parallel +strategy for automated short-selling based on market trends, and risk +management through innovative Incremental Conditional Value at Risk, enhancing +adaptability and performance. Experimental validation on five diverse datasets +from 2019 to 2023 demonstrates MTS's superiority over traditional algorithms +and advanced machine learning techniques. MTS consistently achieves higher +cumulative returns, Sharpe, Omega, and Sortino ratios, underscoring its +effectiveness in balancing risk and return while adapting to market dynamics. +MTS demonstrates an average relative increase of 30.67% in cumulative returns +and 29.33% in Sharpe ratio compared to the next best-performing strategies +across various datasets. + +
+
+
+
+
+ + ☆ Artificial Intelligence in Pronunciation Teaching: Use and Beliefs of + Foreign Language Teachers + + +
+ Pronunciation instruction in foreign language classrooms has often been an +overlooked area of focus. With the widespread adoption of Artificial +Intelligence (AI) and its potential benefits, investigating how AI is utilized +in pronunciation teaching and understanding the beliefs of teachers about this +tool is essential for improving learning outcomes. This study aims to examine +how AI use for pronunciation instruction varies across different demographic +and professional factors among teachers, and how these factors, including AI +use, influence the beliefs of teachers about AI. The study involved 117 English +as a Foreign Language (EFL) in-service teachers working in Cyprus, who +completed an online survey designed to assess their beliefs about the +effectiveness of AI, its drawbacks, and their willingness to integrate AI into +their teaching practices. The results revealed that teachers were significantly +more likely to agree on the perceived effectiveness of AI and their willingness +to adopt it, compared to their concerns about its use. Furthermore, teachers +working in higher education and adult education, as well as those who had +received more extensive training, reported using AI more frequently in their +teaching. Teachers who utilized AI more often expressed stronger agreement with +its effectiveness, while those who had received more training were less likely +to express concerns about its integration. Given the limited training that many +teachers currently receive, these findings demonstrate the need for tailored +training sessions that address the specific needs and concerns of educators, +ultimately fostering the adoption of AI in pronunciation instruction. + +
+
+
+
+
+ + ☆ Simple Self Organizing Map with Visual Transformer + + +
+ Vision Transformers (ViTs) have demonstrated exceptional performance in +various vision tasks. However, they tend to underperform on smaller datasets +due to their inherent lack of inductive biases. Current approaches address this +limitation implicitly, often by pairing ViTs with pretext tasks or by distilling +knowledge from convolutional neural networks (CNNs) to strengthen the prior. In +contrast, Self-Organizing Maps (SOMs), a widely adopted self-supervised +framework, are inherently structured to preserve topology and spatial +organization, making them a promising candidate to directly address the +limitations of ViTs in limited or small training datasets. Despite this +potential, equipping SOMs with modern deep learning architectures remains +largely unexplored. In this study, we conduct a novel exploration on how Vision +Transformers (ViTs) and Self-Organizing Maps (SOMs) can empower each other, +aiming to bridge this critical research gap. Our findings demonstrate that +these architectures can synergistically enhance each other, leading to +significantly improved performance in both unsupervised and supervised tasks. +Code will be publicly available. + +
+
+ comment: 5 pages, 4 figures. Submitted to IEEE. All experiments and code work + were performed by the first author, with the second author serving in a + PI/mentor role, guiding the progression of the work +
+
+
+
+
+ + ☆ Generalizability of Neural Networks Minimizing Empirical Risk Based on + Expressive Ability + + +
+ The primary objective of learning methods is generalization. Classic uniform +generalization bounds, which rely on VC-dimension or Rademacher complexity, +fail to explain the significant attribute that over-parameterized models in +deep learning exhibit nice generalizability. On the other hand, +algorithm-dependent generalization bounds, like stability bounds, often rely on +strict assumptions. To establish generalizability under less stringent +assumptions, this paper investigates the generalizability of neural networks +that minimize or approximately minimize empirical risk. We establish a lower +bound for population accuracy based on the expressiveness of these networks, +which indicates that with an adequately large number of training samples and +network sizes, these networks, including over-parameterized ones, can +generalize effectively. Additionally, we provide a necessary condition for +generalization, demonstrating that, for certain data distributions, the +quantity of training data required to ensure generalization exceeds the network +size needed to represent the corresponding data distribution. Finally, we +provide theoretical insights into several phenomena in deep learning, including +robust generalization, importance of over-parameterization, and effect of loss +function on generalization. + +
+
+
+
+
+ + ♻ ☆ How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming + Ability in Multi-Agent Environments ICLR 2025 + + +
+ Decision-making is a complex process requiring diverse abilities, making it +an excellent framework for evaluating Large Language Models (LLMs). Researchers +have examined LLMs' decision-making through the lens of Game Theory. However, +existing evaluations mainly focus on two-player scenarios where an LLM competes +against another. Additionally, previous benchmarks suffer from test set leakage +due to their static design. We introduce GAMA($\gamma$)-Bench, a new framework +for evaluating LLMs' Gaming Ability in Multi-Agent environments. It includes +eight classical game theory scenarios and a dynamic scoring scheme specially +designed to quantitatively assess LLMs' performance. $\gamma$-Bench allows +flexible game settings and adapts the scoring system to different game +parameters, enabling comprehensive evaluation of robustness, generalizability, +and strategies for improvement. Our results indicate that GPT-3.5 demonstrates +strong robustness but limited generalizability, which can be enhanced using +methods like Chain-of-Thought. We also evaluate 13 LLMs from 6 model families, +including GPT-3.5, GPT-4, Gemini, LLaMA-3.1, Mixtral, and Qwen-2. +Gemini-1.5-Pro outperforms others, scoring $69.8$ out of $100$, followed by +LLaMA-3.1-70B ($65.9$) and Mixtral-8x22B ($62.4$). Our code and experimental +results are publicly available at https://github.com/CUHK-ARISE/GAMABench. + +
+
+ comment: Accepted to ICLR 2025; 11 pages of main text; 26 pages of appendices; + Included models: GPT-3.5-{0613, 1106, 0125}, GPT-4-0125, GPT-4o-0806, + Gemini-{1.0, 1.5}-Pro, LLaMA-3.1-{7, 70, 405}B, Mixtral-8x{7, 22}B, + Qwen-2-72B +
+
+
+
+
+ + ♻ ☆ DEFT: Differentiable Branched Discrete Elastic Rods for Modeling + Furcated DLOs in Real-Time + + +
+ Autonomous wire harness assembly requires robots to manipulate complex +branched cables with high precision and reliability. A key challenge in +automating this process is predicting how these flexible and branched +structures behave under manipulation. Without accurate predictions, it is +difficult for robots to reliably plan or execute assembly operations. While +existing research has made progress in modeling single-threaded Deformable +Linear Objects (DLOs), extending these approaches to Branched Deformable Linear +Objects (BDLOs) presents fundamental challenges. The junction points in BDLOs +create complex force interactions and strain propagation patterns that cannot +be adequately captured by simply connecting multiple single-DLO models. To +address these challenges, this paper presents Differentiable discrete branched +Elastic rods for modeling Furcated DLOs in real-Time (DEFT), a novel framework +that combines a differentiable physics-based model with a learning framework +to: 1) accurately model BDLO dynamics, including dynamic propagation at +junction points and grasping in the middle of a BDLO, 2) achieve efficient +computation for real-time inference, and 3) enable planning to demonstrate +dexterous BDLO manipulation. A comprehensive series of real-world experiments +demonstrates DEFT's efficacy in terms of accuracy, computational speed, and +generalizability compared to state-of-the-art alternatives. Project +page: https://roahmlab.github.io/DEFT/. + +
+
+
+
+
+ + ♻ ☆ Do Not Trust Licenses You See -- Dataset Compliance Requires + Massive-Scale AI-Powered Lifecycle Tracing + + +
+ This paper argues that a dataset's legal risk cannot be accurately assessed +by its license terms alone; instead, tracking dataset redistribution and its +full lifecycle is essential. However, this process is too complex for legal +experts to handle manually at scale. Tracking dataset provenance, verifying +redistribution rights, and assessing evolving legal risks across multiple +stages require a level of precision and efficiency that exceeds human +capabilities. Addressing this challenge effectively demands AI agents that can +systematically trace dataset redistribution, analyze compliance, and identify +legal risks. We develop an automated data compliance system called NEXUS and +show that AI can perform these tasks with higher accuracy, efficiency, and +cost-effectiveness than human experts. Our massive legal analysis of 17,429 +unique entities and 8,072 license terms using this approach reveals the +discrepancies in legal rights between the original datasets before +redistribution and their redistributed subsets, underscoring the necessity of +the data lifecycle-aware compliance. For instance, we find that out of 2,852 +datasets with commercially viable individual license terms, only 605 (21%) are +legally permissible for commercialization. This work sets a new standard for AI +data governance, advocating for a framework that systematically examines the +entire lifecycle of dataset redistribution to ensure transparent, legal, and +responsible dataset management. + +
+
+
+
+
+ + ♻ ☆ HELMET: How to Evaluate Long-Context Language Models Effectively and + Thoroughly ICLR 2025 + + +
+ Many benchmarks exist for evaluating long-context language models (LCLMs), +yet developers often rely on synthetic tasks such as needle-in-a-haystack +(NIAH) or an arbitrary subset of tasks. However, it remains unclear whether +these benchmarks reflect the diverse downstream applications of LCLMs, and such +inconsistencies further complicate model comparison. We investigate the +underlying reasons behind these practices and find that existing benchmarks +often provide noisy signals due to limited coverage of applications, +insufficient context lengths, unreliable metrics, and incompatibility with base +models. In this work, we introduce HELMET (How to Evaluate Long-context Models +Effectively and Thoroughly), a comprehensive benchmark encompassing seven +diverse, application-centric categories. We also address several issues in +previous benchmarks by adding controllable lengths up to 128K tokens, +model-based evaluation for reliable metrics, and few-shot prompting for +robustly evaluating base models. Consequently, we demonstrate that HELMET +offers more reliable and consistent rankings of frontier LCLMs. Through a +comprehensive study of 59 LCLMs, we find that (1) synthetic tasks like NIAH do +not reliably predict downstream performance; (2) the diverse categories in +HELMET exhibit distinct trends and low correlations with each other; and (3) +while most LCLMs achieve perfect NIAH scores, open-source models significantly +lag behind closed ones when tasks require full-context reasoning or following +complex instructions -- the gap widens as length increases. Finally, we +recommend using our RAG tasks for fast model development, as they are easy to +run and better predict other downstream performance; ultimately, we advocate +for a holistic evaluation across diverse tasks. + +
+
+ comment: ICLR 2025. Project page: https://princeton-nlp.github.io/HELMET/ +
+
+
+
+
+ + ♻ ☆ AdaptBot: Combining LLM with Knowledge Graphs and Human Input for + Generic-to-Specific Task Decomposition and Knowledge Refinement ICRA + + +
+ An embodied agent assisting humans is often asked to complete new tasks, and +there may not be sufficient time or labeled examples to train the agent to +perform these new tasks. Large Language Models (LLMs) trained on considerable +knowledge across many domains can be used to predict a sequence of abstract +actions for completing such tasks, although the agent may not be able to +execute this sequence due to task-, agent-, or domain-specific constraints. Our +framework addresses these challenges by leveraging the generic predictions +provided by LLM and the prior domain knowledge encoded in a Knowledge Graph +(KG), enabling an agent to quickly adapt to new tasks. The robot also solicits +and uses human input as needed to refine its existing knowledge. Based on +experimental evaluation in the context of cooking and cleaning tasks in +simulation domains, we demonstrate that the interplay between LLM, KG, and +human input leads to substantial performance gains compared with just using the +LLM. Project website{\S}: https://sssshivvvv.github.io/adaptbot/ + +
+
+ comment: Accepted to IEEE International Conference on Robotics and Automation + (ICRA) 2025 +
+
+
+
+
+ + ♻ ☆ Detecting Systematic Weaknesses in Vision Models along Predefined + Human-Understandable Dimensions + + +
+ Slice discovery methods (SDMs) are prominent algorithms for finding +systematic weaknesses in DNNs. They identify top-k semantically coherent +slices/subsets of data where a DNN-under-test has low performance. For being +directly useful, slices should be aligned with human-understandable and +relevant dimensions, which, for example, are defined by safety and domain +experts as part of the operational design domain (ODD). While SDMs can be +applied effectively on structured data, their application on image data is +complicated by the lack of semantic metadata. To address these issues, we +present an algorithm that combines foundation models for zero-shot image +classification to generate semantic metadata with methods for combinatorial +search to find systematic weaknesses in images. In contrast to existing +approaches, ours identifies weak slices that are in line with pre-defined +human-understandable dimensions. As the algorithm includes foundation models, +its intermediate and final results may not always be exact. Therefore, we +include an approach to address the impact of noisy metadata. We validate our +algorithm on both synthetic and real-world datasets, demonstrating its ability +to recover human-understandable systematic weaknesses. Furthermore, using our +approach, we identify systematic weaknesses of multiple pre-trained and +publicly available state-of-the-art computer vision DNNs. + +
+
+
+
+
+ + ♻ ☆ Back Home: A Machine Learning Approach to Seashell Classification and + Ecosystem Restoration + + +
+ In Costa Rica, an average of 5 tons of seashells are extracted from +ecosystems annually. Confiscated seashells cannot be returned to their +ecosystems due to the lack of origin recognition. To address this issue, we +developed a convolutional neural network (CNN) specifically for seashell +identification. We built a dataset from scratch, consisting of approximately +19000 images from the Pacific and Caribbean coasts. Using this dataset, the +model achieved a classification accuracy exceeding 85%. The model has been +integrated into a user-friendly application, which has classified over 36,000 +seashells to date, delivering real-time results within 3 seconds per image. To +further enhance the system's accuracy, an anomaly detection mechanism was +incorporated to filter out irrelevant or anomalous inputs, ensuring only valid +seashell images are processed. + +
+
+
+
+
+ + ♻ ☆ Tutorial on amortized optimization + + +
+ Optimization is a ubiquitous modeling tool and is often deployed in settings +which repeatedly solve similar instances of the same problem. Amortized +optimization methods use learning to predict the solutions to problems in these +settings, exploiting the shared structure between similar problem instances. +These methods have been crucial in variational inference and reinforcement +learning and are capable of solving optimization problems many orders of +magnitude faster than traditional optimization methods that do not use +amortization. This tutorial presents an introduction to the amortized +optimization foundations behind these advancements and overviews their +applications in variational inference, sparse coding, gradient-based +meta-learning, control, reinforcement learning, convex optimization, optimal +transport, and deep equilibrium networks. The source code for this tutorial is +available at +https://github.com/facebookresearch/amortized-optimization-tutorial. + +
+
+ comment: Foundations and Trends in Machine Learning +
+
+
+
+
+ + ♻ ☆ A Simple and Effective Reinforcement Learning Method for Text-to-Image + Diffusion Fine-tuning + + +
+ Reinforcement learning (RL)-based fine-tuning has emerged as a powerful +approach for aligning diffusion models with black-box objectives. Proximal +policy optimization (PPO) is the most popular choice of method for policy +optimization. While effective in terms of performance, PPO is highly sensitive +to hyper-parameters and involves substantial computational overhead. REINFORCE, +on the other hand, mitigates some computational complexities such as high +memory overhead and sensitive hyper-parameter tuning, but has suboptimal +performance due to high-variance and sample inefficiency. While the variance of +the REINFORCE can be reduced by sampling multiple actions per input prompt and +using a baseline correction term, it still suffers from sample inefficiency. To +address these challenges, we systematically analyze the +efficiency-effectiveness trade-off between REINFORCE and PPO, and propose +leave-one-out PPO (LOOP), a novel RL for diffusion fine-tuning method. LOOP +combines variance reduction techniques from REINFORCE, such as sampling +multiple actions per input prompt and a baseline correction term, with the +robustness and sample efficiency of PPO via clipping and importance sampling. +Our results demonstrate that LOOP effectively improves diffusion models on +various black-box objectives, and achieves a better balance between +computational efficiency and performance. + +
+
+
+
+
+ + ♻ ☆ Human-Feedback Efficient Reinforcement Learning for Online Diffusion + Model Finetuning ICLR + + +
+ Controllable generation through Stable Diffusion (SD) fine-tuning aims to +improve fidelity, safety, and alignment with human guidance. Existing +reinforcement learning from human feedback methods usually rely on predefined +heuristic reward functions or pretrained reward models built on large-scale +datasets, limiting their applicability to scenarios where collecting such data +is costly or difficult. To effectively and efficiently utilize human feedback, +we develop a framework, HERO, which leverages online human feedback collected +on the fly during model learning. Specifically, HERO features two key +mechanisms: (1) Feedback-Aligned Representation Learning, an online training +method that captures human feedback and provides informative learning signals +for fine-tuning, and (2) Feedback-Guided Image Generation, which involves +generating images from SD's refined initialization samples, enabling faster +convergence towards the evaluator's intent. We demonstrate that HERO is 4x more +efficient in online feedback for body part anomaly correction compared to the +best existing method. Additionally, experiments show that HERO can effectively +handle tasks like reasoning, counting, personalization, and reducing NSFW +content with only 0.5K online feedback. + +
+
+ comment: Published in International Conference on Learning Representations + (ICLR) 2025 +
+
+
+
+
+ + ♻ ☆ Self-supervised pre-training with diffusion model for few-shot landmark + detection in x-ray images WACV 2025 + + +
+ Deep neural networks have been extensively applied in the medical domain for +various tasks, including image classification, segmentation, and landmark +detection. However, their application is often hindered by data scarcity, both +in terms of available annotations and images. This study introduces a novel +application of denoising diffusion probabilistic models (DDPMs) to the landmark +detection task, specifically addressing the challenge of limited annotated data +in x-ray imaging. Our key innovation lies in leveraging DDPMs for +self-supervised pre-training in landmark detection, a previously unexplored +approach in this domain. This method enables accurate landmark detection with +minimal annotated training data (as few as 50 images), surpassing both ImageNet +supervised pre-training and traditional self-supervised techniques across three +popular x-ray benchmark datasets. To our knowledge, this work represents the +first application of diffusion models for self-supervised learning in landmark +detection, which may offer a valuable pre-training approach in few-shot +regimes, for mitigating data scarcity. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ ACC-Collab: An Actor-Critic Approach to Multi-Agent LLM Collaboration + + +
+ Large language models (LLMs) have demonstrated a remarkable ability to serve +as general-purpose tools for various language-based tasks. Recent works have +demonstrated that the efficacy of such models can be improved through iterative +dialog between multiple models. While these paradigms show promise in improving +model efficacy, most works in this area treat collaboration as an emergent +behavior, rather than a learned behavior. In doing so, current multi-agent +frameworks rely on collaborative behaviors to have been sufficiently trained +into off-the-shelf models. To address this limitation, we propose ACC-Collab, +an Actor-Critic based learning framework to produce a two-agent team (an +actor-agent and a critic-agent) specialized in collaboration. We demonstrate +that ACC-Collab outperforms SotA multi-agent techniques on a wide array of +benchmarks. + +
+
+
+
+
+ + ♻ ☆ Towards One Model for Classical Dimensionality Reduction: A + Probabilistic Perspective on UMAP and t-SNE + + +
+ This paper shows that dimensionality reduction methods such as UMAP and +t-SNE, can be approximately recast as MAP inference methods corresponding to a +model introduced in ProbDR, that describes the graph Laplacian (an estimate of +the data precision matrix) using a Wishart distribution, with a mean given by a +non-linear covariance function evaluated on the latents. This interpretation +offers deeper theoretical and semantic insights into such algorithms, by +showing that variances corresponding to these covariances are low (potentially +misspecified), and forging a connection to Gaussian process latent variable +models by showing that well-known kernels can be used to describe covariances +implied by graph Laplacians. We also introduce tools with which similar +dimensionality reduction methods can be studied. + +
+
+ comment: Updated preprint +
+
+
+
+
+ + ♻ ☆ LINGOLY-TOO: Disentangling Memorisation from Reasoning with Linguistic + Templatisation and Orthographic Obfuscation + + +
+ Assessing the reasoning capabilities of large language models (LLMs) is +susceptible to overestimation due to data exposure of evaluation benchmarks. We +introduce a framework for producing linguistic reasoning problems that reduces +the effect of memorisation in model performance estimates and apply this +framework to develop LINGOLY-TOO, a challenging benchmark for linguistic +reasoning. By developing orthographic templates, we dynamically obfuscate the +writing systems of real languages to generate numerous question variations. +These variations preserve the reasoning steps required for each solution while +reducing the likelihood of specific problem instances appearing in model +training data. Our experiments demonstrate that frontier models, including +Claude 3.7 Sonnet, o1-preview and DeepSeek R1, struggle with advanced reasoning. +Our analysis also shows that LLMs exhibit noticeable variance in accuracy +across permutations of the same problem, and on average perform better on +questions appearing in their original orthography. Our findings highlight the +opaque nature of response generation in LLMs and provide evidence that prior +data exposure contributes to overestimating the reasoning capabilities of +frontier models. + +
+
+
+
+
+ + ♻ ☆ Protein Large Language Models: A Comprehensive Survey + + +
+ Protein-specific large language models (Protein LLMs) are revolutionizing +protein science by enabling more efficient protein structure prediction, +function annotation, and design. While existing surveys focus on specific +aspects or applications, this work provides the first comprehensive overview of +Protein LLMs, covering their architectures, training datasets, evaluation +metrics, and diverse applications. Through a systematic analysis of over 100 +articles, we propose a structured taxonomy of state-of-the-art Protein LLMs, +analyze how they leverage large-scale protein sequence data for improved +accuracy, and explore their potential in advancing protein engineering and +biomedical research. Additionally, we discuss key challenges and future +directions, positioning Protein LLMs as essential tools for scientific +discovery in protein science. Resources are maintained at +https://github.com/Yijia-Xiao/Protein-LLM-Survey. + +
+
+ comment: 24 pages, 4 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ $\texttt{SEM-CTRL}$: Semantically Controlled Decoding + + +
+ Ensuring both syntactic and semantic correctness in Large Language Model +(LLM) outputs remains a significant challenge, despite being critical for +real-world deployment. In this paper, we introduce $\texttt{SEM-CTRL}$, a +unified approach that enforces rich context-sensitive constraints and task- and +instance-specific semantics directly on an LLM decoder. Our approach integrates +token-level MCTS, which is guided by specific syntactic and semantic +constraints. The constraints over the desired outputs are expressed using +Answer Set Grammars -- a logic-based formalism that generalizes +context-sensitive grammars while incorporating background knowledge to +represent task-specific semantics. We show that our approach guarantees correct +completions for any off-the-shelf LLM without the need for fine-tuning. We +evaluate $\texttt{SEM-CTRL}$ on a range of tasks, including synthetic grammar +synthesis, combinatorial reasoning, and planning. Our results demonstrate that +$\texttt{SEM-CTRL}$ allows small pre-trained LLMs to efficiently outperform +larger variants and state-of-the-art reasoning models (e.g., o1-preview) while +simultaneously guaranteeing solution correctness. + +
+
+
+
+
+ + ♻ ☆ Beyond Single Concept Vector: Modeling Concept Subspace in LLMs with + Gaussian Distribution ICLR 2025 + + +
+ Probing learned concepts in large language models (LLMs) is crucial for +understanding how semantic knowledge is encoded internally. Training linear +classifiers on probing tasks is a principal approach to denote the vector of a +certain concept in the representation space. However, the single vector +identified for a concept varies with both data and training, making it less +robust and weakening its effectiveness in real-world applications. To address +this challenge, we propose an approach to approximate the subspace representing +a specific concept. Built on linear probing classifiers, we extend the concept +vectors into Gaussian Concept Subspace (GCS). We demonstrate GCS's +effectiveness through measuring its faithfulness and plausibility across +multiple LLMs with different sizes and architectures. Additionally, we use +representation intervention tasks to showcase its efficacy in real-world +applications such as emotion steering. Experimental results indicate that GCS +concept vectors have the potential to balance steering performance and +maintaining the fluency in natural language generation tasks. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks for Virtual Sensing in Complex Systems: Addressing + Heterogeneous Temporal Dynamics SP + + +
+ Real-time condition monitoring is crucial for the reliable and efficient +operation of complex systems. However, relying solely on physical sensors can +be limited due to their cost, placement constraints, or inability to directly +measure certain critical parameters. Virtual sensing addresses these +limitations by leveraging readily available sensor data and system knowledge to +estimate inaccessible parameters or infer system states. The increasing +complexity of industrial systems necessitates deployments of sensors with +diverse modalities to provide a comprehensive understanding of system states. +These sensors capture data at varying frequencies to monitor both rapid and +slowly varying system dynamics, as well as local and global state evolutions of +the systems. This leads to heterogeneous temporal dynamics, which, particularly +under varying operational and environmental conditions, pose a significant +challenge for accurate virtual sensing. To address this, we propose a +Heterogeneous Temporal Graph Neural Network (HTGNN) framework. HTGNN explicitly +models signals from diverse sensors and integrates operating conditions into +the model architecture. We evaluate HTGNN using two newly released datasets: a +bearing dataset with diverse load conditions for bearing load prediction and a +year-long simulated dataset for predicting bridge live loads. Our results +demonstrate that HTGNN significantly outperforms established baseline methods +in both tasks, particularly under highly varying operating conditions. These +results highlight HTGNN's potential as a robust and accurate virtual sensing +approach for complex systems, paving the way for improved monitoring, +predictive maintenance, and enhanced system performance. Our code and data are +available under https://github.com/EPFL-IMOS/htgnn. + +
+
+ comment: This paper extends our previous conference paper (Best Paper at + European Conference of the PHM Society 2024, + https://doi.org/10.36001/phme.2024.v8i1.3998). Accepted by Mechanical Systems + and Signal Processing (MSSP) +
+
+
+
+
+ + ♻ ☆ X-Boundary: Establishing Exact Safety Boundary to Shield LLMs from + Multi-Turn Jailbreaks without Compromising Usability + + +
+ Despite the rapid development of safety alignment techniques for LLMs, +defending against multi-turn jailbreaks is still a challenging task. In this +paper, we conduct a comprehensive comparison, revealing that some existing +defense methods can improve the robustness of LLMs against multi-turn +jailbreaks but compromise usability, i.e., reducing general capabilities or +causing the over-refusal problem. From the perspective of mechanism +interpretability of LLMs, we discover that these methods fail to establish a +boundary that exactly distinguishes safe and harmful feature representations. +Therefore, boundary-safe representations close to harmful representations are +inevitably disrupted, leading to a decline in usability. To address this issue, +we propose X-Boundary to push harmful representations away from boundary-safe +representations and obtain an exact distinction boundary. In this way, harmful +representations can be precisely erased without disrupting safe ones. +Experimental results show that X-Boundary achieves state-of-the-art defense +performance against multi-turn jailbreaks, while reducing the over-refusal rate +by about 20% and maintaining nearly complete general capability. Furthermore, +we theoretically prove and empirically verify that X-Boundary can accelerate +the convergence process during training. Please see our code at: +https://github.com/AI45Lab/X-Boundary. + +
+
+
+
+
+ + ♻ ☆ UoR-NCL at SemEval-2025 Task 1: Using Generative LLMs and CLIP Models + for Multilingual Multimodal Idiomaticity Representation + + +
+ SemEval-2025 Task 1 focuses on ranking images based on their alignment with a +given nominal compound that may carry idiomatic meaning in both English and +Brazilian Portuguese. To address this challenge, this work uses generative +large language models (LLMs) and multilingual CLIP models to enhance idiomatic +compound representations. LLMs generate idiomatic meanings for potentially +idiomatic compounds, enriching their semantic interpretation. These meanings +are then encoded using multilingual CLIP models, serving as representations for +image ranking. Contrastive learning and data augmentation techniques are +applied to fine-tune these embeddings for improved performance. Experimental +results show that multimodal representations extracted through this method +outperformed those based solely on the original nominal compounds. The +fine-tuning approach shows promising outcomes but is less effective than using +embeddings without fine-tuning. The source code used in this paper is available +at https://github.com/tongwu17/SemEval-2025-Task1-UoR-NCL. + +
+
+
+
+
+ + ♻ ☆ On the Challenges and Opportunities in Generative AI + + +
+ The field of deep generative modeling has grown rapidly in the last few +years. With the availability of massive amounts of training data coupled with +advances in scalable unsupervised learning paradigms, recent large-scale +generative models show tremendous promise in synthesizing high-resolution +images and text, as well as structured data such as videos and molecules. +However, we argue that current large-scale generative AI models exhibit several +fundamental shortcomings that hinder their widespread adoption across domains. +In this work, our objective is to identify these issues and highlight key +unresolved challenges in modern generative AI paradigms that should be +addressed to further enhance their capabilities, versatility, and reliability. +By identifying these challenges, we aim to provide researchers with insights +for exploring fruitful research directions, thus fostering the development of +more robust and accessible generative AI solutions. + +
+
+
+
+
+ + ♻ ☆ Gumbel Counterfactual Generation From Language Models ICLR 2025 + + +
+ Understanding and manipulating the causal generation mechanisms in language +models is essential for controlling their behavior. Previous work has primarily +relied on techniques such as representation surgery -- e.g., model ablations or +manipulation of linear subspaces tied to specific concepts -- to +\emph{intervene} on these models. To understand the impact of interventions +precisely, it is useful to examine \emph{counterfactuals} -- e.g., how a given +sentence would have appeared had it been generated by the model following a +specific intervention. We highlight that counterfactual reasoning is +conceptually distinct from interventions, as articulated in Pearl's causal +hierarchy. Based on this observation, we propose a framework for generating +true string counterfactuals by reformulating language models as a structural +equation model using the Gumbel-max trick, which we call Gumbel +counterfactual generation. This reformulation allows us to model the joint +distribution over original strings and their counterfactuals resulting from the +same instantiation of the sampling noise. We develop an algorithm based on +hindsight Gumbel sampling that allows us to infer the latent noise variables +and generate counterfactuals of observed strings. Our experiments demonstrate +that the approach produces meaningful counterfactuals while at the same time +showing that commonly used intervention techniques have considerable undesired +side effects. + +
+
+ comment: Accepted in ICLR 2025 +
+
+
+
+
+ + ♻ ☆ MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor + Scene Generation AAAI 2025 + + +
+ Controllable 3D scene generation has extensive applications in virtual +reality and interior design, where the generated scenes should exhibit high +levels of realism and controllability in terms of geometry. Scene graphs +provide a suitable data representation that facilitates these applications. +However, current graph-based methods for scene generation are constrained to +text-based inputs and exhibit insufficient adaptability to flexible user +inputs, hindering the ability to precisely control object geometry. To address +this issue, we propose MMGDreamer, a dual-branch diffusion model for scene +generation that incorporates a novel Mixed-Modality Graph, visual enhancement +module, and relation predictor. The mixed-modality graph allows object nodes to +integrate textual and visual modalities, with optional relationships between +nodes. It enhances adaptability to flexible user inputs and enables meticulous +control over the geometry of objects in the generated scenes. The visual +enhancement module enriches the visual fidelity of text-only nodes by +constructing visual representations using text embeddings. Furthermore, our +relation predictor leverages node representations to infer absent relationships +between nodes, resulting in more coherent scene layouts. Extensive experimental +results demonstrate that MMGDreamer exhibits superior control of object +geometry, achieving state-of-the-art scene generation performance. Project +page: https://yangzhifeio.github.io/project/MMGDreamer. + +
+
+ comment: Accepted by AAAI 2025 Main Track +
+
+
+
+
+ + ♻ ☆ Pretrained Embeddings as a Behavior Specification Mechanism + + +
+ We propose an approach to formally specifying the behavioral properties of +systems that rely on a perception model for interactions with the physical +world. The key idea is to introduce embeddings -- mathematical representations +of a real-world concept -- as a first-class construct in a specification +language, where properties are expressed in terms of distances between a pair +of ideal and observed embeddings. To realize this approach, we propose a new +type of temporal logic called Embedding Temporal Logic (ETL), and describe how +it can be used to express a wider range of properties about AI-enabled systems +than previously possible. We demonstrate the applicability of ETL through a +preliminary evaluation involving planning tasks in robots that are driven by +foundation models; the results are promising, showing that embedding-based +specifications can be used to steer a system towards desirable behaviors. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Decoupled Recommender Systems: Exploring Alternative Recommender + Ecosystem Designs + + +
+ Recommender ecosystems are an emerging subject of research. Such research +examines how the characteristics of algorithms, recommendation consumers, and +item providers influence system dynamics and long-term outcomes. One +architectural possibility that has not yet been widely explored in this line of +research is the consequences of a configuration in which recommendation +algorithms are decoupled from the platforms they serve. This is sometimes +called "the friendly neighborhood algorithm store" or "middleware" model. We +are particularly interested in how such architectures might offer a range of +different distributions of utility across consumers, providers, and +recommendation platforms. In this paper, we create a model of a recommendation +ecosystem that incorporates algorithm choice and examine the outcomes of such a +design. + +
+
+
+
+
+ + ♻ ☆ MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D + Medical Image Analysis + + +
+ Efficient evaluation of three-dimensional (3D) medical images is crucial for +diagnostic and therapeutic practices in healthcare. Recent years have seen a +substantial uptake in applying deep learning and computer vision to analyse and +interpret medical images. Traditional approaches, such as convolutional neural +networks (CNNs) and vision transformers (ViTs), face significant computational +challenges, prompting the need for architectural advancements. Recent efforts +have led to the introduction of novel architectures like the ``Mamba'' model as +alternative solutions to traditional CNNs or ViTs. The Mamba model excels in +the linear processing of one-dimensional data with low computational demands. +However, Mamba's potential for 3D medical image analysis remains underexplored +and could face significant computational challenges as the dimension increases. +This manuscript presents MobileViM, a streamlined architecture for efficient +segmentation of 3D medical images. In the MobileViM network, we invent a new +dimension-independent mechanism and a dual-direction traversing approach to +incorporate with a vision-Mamba-based framework. MobileViM also features a +cross-scale bridging technique to improve efficiency and accuracy across +various medical imaging modalities. With these enhancements, MobileViM achieves +segmentation speeds exceeding 90 frames per second (FPS) on a single graphics +processing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster +than the state-of-the-art deep learning models for processing 3D images with +the same computational resources. In addition, experimental evaluations +demonstrate that MobileViM delivers superior performance, with Dice similarity +scores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024, +ATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses +existing models. + +
+
+ comment: The corresponding author disagrees with the manuscript submitted to + arXiv +
+
+
+
+
+ + ♻ ☆ Secure Federated Data Distillation + + +
+ Dataset Distillation (DD) is a powerful technique for reducing large datasets +into compact, representative synthetic datasets, accelerating Machine Learning +training. However, traditional DD methods operate in a centralized manner, +which poses significant privacy threats and reduces its applicability. To +mitigate these risks, we propose a Secure Federated Data Distillation (SFDD) +framework to decentralize the distillation process while preserving privacy. +Unlike existing Federated Distillation techniques that focus on training global +models with distilled knowledge, our approach aims to produce a distilled +dataset without exposing local contributions. We leverage the +gradient-matching-based distillation method, adapting it for a distributed +setting where clients contribute to the distillation process without sharing +raw data. The central aggregator iteratively refines a synthetic dataset by +integrating client-side updates while ensuring data confidentiality. To make +our approach resilient to inference attacks perpetrated by the server that +could exploit gradient updates to reconstruct private data, we create an +optimized Local Differential Privacy approach, called LDPO-RLD. Furthermore, we +assess the framework's resilience against malicious clients executing backdoor +attacks (such as Doorping) and demonstrate robustness under the assumption of a +sufficient number of participating clients. Our experimental results +demonstrate the effectiveness of SFDD and that the proposed defense concretely +mitigates the identified vulnerabilities, with minimal impact on the +performance of the distilled dataset. By addressing the interplay between +privacy and federation in dataset distillation, this work advances the field of +privacy-preserving Machine Learning making our SFDD framework a viable solution +for sensitive data-sharing applications. + +
+
+
+
+
+ + ♻ ☆ Which Frequencies do CNNs Need? Emergent Bottleneck Structure in Feature + Learning + + +
+ We describe the emergence of a Convolution Bottleneck (CBN) structure in +CNNs, where the network uses its first few layers to transform the input +representation into a representation that is supported only along a few +frequencies and channels, before using the last few layers to map back to the +outputs. We define the CBN rank, which describes the number and type of +frequencies that are kept inside the bottleneck, and partially prove that the +parameter norm required to represent a function $f$ scales as depth times the +CBN rank of $f$. We also show that the parameter norm depends at next order on the +regularity of $f$. We show that any network with almost optimal parameter norm +will exhibit a CBN structure in both the weights and - under the assumption +that the network is stable under large learning rate - the activations, which +motivates the common practice of down-sampling; and we verify that the CBN +results still hold with down-sampling. Finally we use the CBN structure to +interpret the functions learned by CNNs on a number of tasks. + +
+
+
+
+
+ + ♻ ☆ Assisting Mathematical Formalization with A Learning-based Premise + Retriever + + +
+ Premise selection is a crucial yet challenging step in mathematical +formalization, especially for users with limited experience. Due to the lack of +available formalization projects, existing approaches that leverage language +models often suffer from data scarcity. In this work, we introduce an +innovative method for training a premise retriever to support the formalization +of mathematics. Our approach employs a BERT model to embed proof states and +premises into a shared latent space. The retrieval model is trained within a +contrastive learning framework and incorporates a domain-specific tokenizer +along with a fine-grained similarity computation method. Experimental results +show that our model is highly competitive compared to existing baselines, +achieving strong performance while requiring fewer computational resources. +Performance is further enhanced through the integration of a re-ranking module. +To streamline the formalization process, we will release a search engine that +enables users to query Mathlib theorems directly using proof states, +significantly improving accessibility and efficiency. Codes are available at +https://github.com/ruc-ai4math/Premise-Retrieval. + +
+
+
+
+
+ + ♻ ☆ Hamiltonian Mechanics of Feature Learning: Bottleneck Structure in Leaky + ResNets + + +
+ We study Leaky ResNets, which interpolate between ResNets and Fully-Connected +nets depending on an 'effective depth' hyper-parameter $\tilde{L}$. In the +infinite depth limit, we study 'representation geodesics' $A_{p}$: continuous +paths in representation space (similar to NeuralODEs) from input $p=0$ to +output $p=1$ that minimize the parameter norm of the network. We give a +Lagrangian and Hamiltonian reformulation, which highlight the importance of two +terms: a kinetic energy which favors small layer derivatives +$\partial_{p}A_{p}$ and a potential energy that favors low-dimensional +representations, as measured by the 'Cost of Identity'. The balance between +these two forces offers an intuitive understanding of feature learning in +ResNets. We leverage this intuition to explain the emergence of a bottleneck +structure, as observed in previous work: for large $\tilde{L}$ the potential +energy dominates and leads to a separation of timescales, where the +representation jumps rapidly from the high dimensional inputs to a +low-dimensional representation, moves slowly inside the space of low-dimensional +representations, before jumping back to the potentially high-dimensional +outputs. Inspired by this phenomenon, we train with an adaptive layer step-size +to adapt to the separation of timescales. + +
+
+
+
+
+ + ♻ ☆ How DNNs break the Curse of Dimensionality: Compositionality and + Symmetry Learning + + +
+ We show that deep neural networks (DNNs) can efficiently learn any +composition of functions with bounded $F_{1}$-norm, which allows DNNs to break +the curse of dimensionality in ways that shallow networks cannot. More +specifically, we derive a generalization bound that combines a covering number +argument for compositionality, and the $F_{1}$-norm (or the related Barron +norm) for large width adaptivity. We show that the global minimizer of the +regularized loss of DNNs can fit for example the composition of two functions +$f^{*}=h\circ g$ from a small number of observations, assuming $g$ is +smooth/regular and reduces the dimensionality (e.g. $g$ could be the quotient +map of the symmetries of $f^{*}$), so that $h$ can be learned in spite of its +low regularity. The measure of regularity we consider is the Sobolev norm with +different levels of differentiability, which is well adapted to the $F_{1}$ +norm. We compute scaling laws empirically and observe phase transitions +depending on whether $g$ or $h$ is harder to learn, as predicted by our theory. + +
+
+
+
+
+ + ♻ ☆ CATCH: Channel-Aware multivariate Time Series Anomaly Detection via + Frequency Patching ICLR 2025 + + +
+ Anomaly detection in multivariate time series is challenging as heterogeneous +subsequence anomalies may occur. Reconstruction-based methods, which focus on +learning normal patterns in the frequency domain to detect diverse abnormal +subsequences, achieve promising results, while still falling short on capturing +fine-grained frequency characteristics and channel correlations. To contend +with the limitations, we introduce CATCH, a framework based on frequency +patching. We propose to patchify the frequency domain into frequency bands, +which enhances its ability to capture fine-grained frequency characteristics. +To perceive appropriate channel correlations, we propose a Channel Fusion +Module (CFM), which features a patch-wise mask generator and a masked-attention +mechanism. Driven by a bi-level multi-objective optimization algorithm, the CFM +is encouraged to iteratively discover appropriate patch-wise channel +correlations, and to cluster relevant channels while isolating adverse effects +from irrelevant channels. Extensive experiments on 10 real-world datasets and +12 synthetic datasets demonstrate that CATCH achieves state-of-the-art +performance. We make our code and datasets available at +https://github.com/decisionintelligence/CATCH. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ AfroBench: How Good are Large Language Models on African Languages? + + +
+ Large-scale multilingual evaluations, such as MEGA, often include only a +handful of African languages due to the scarcity of high-quality evaluation +data and the limited discoverability of existing African datasets. This lack of +representation hinders comprehensive LLM evaluation across a diverse range of +languages and tasks. To address these challenges, we introduce AfroBench -- a +multi-task benchmark for evaluating the performance of LLMs across 64 African +languages, 15 tasks and 22 datasets. AfroBench consists of nine natural +language understanding datasets, six text generation datasets, six knowledge +and question answering tasks, and one mathematical reasoning task. We present +results comparing the performance of prompting LLMs to fine-tuned baselines +based on BERT and T5-style models. Our results suggest large gaps in +performance between high-resource languages, such as English, and African +languages across most tasks; but performance also varies based on the +availability of monolingual data resources. Our findings confirm that +performance on African languages continues to remain a hurdle for current LLMs, +underscoring the need for additional efforts to close this gap. + https://mcgill-nlp.github.io/AfroBench/ + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ OlympicArena: Benchmarking Multi-discipline Cognitive Reasoning for + Superintelligent AI NeurIPS 2024 + + +
+ The evolution of Artificial Intelligence (AI) has been significantly +accelerated by advancements in Large Language Models (LLMs) and Large +Multimodal Models (LMMs), gradually showcasing potential cognitive reasoning +abilities in problem-solving and scientific discovery (i.e., AI4Science) once +exclusive to human intellect. To comprehensively evaluate current models' +performance in cognitive reasoning abilities, we introduce OlympicArena, which +includes 11,163 bilingual problems across both text-only and interleaved +text-image modalities. These challenges encompass a wide range of disciplines +spanning seven fields and 62 international Olympic competitions, rigorously +examined for data leakage. We argue that the challenges in Olympic competition +problems are ideal for evaluating AI's cognitive reasoning due to their +complexity and interdisciplinary nature, which are essential for tackling +complex scientific challenges and facilitating discoveries. Beyond evaluating +performance across various disciplines using answer-only criteria, we conduct +detailed experiments and analyses from multiple perspectives. We delve into the +models' cognitive reasoning abilities, their performance across different +modalities, and their outcomes in process-level evaluations, which are vital +for tasks requiring complex reasoning with lengthy solutions. Our extensive +evaluations reveal that even advanced models like GPT-4o only achieve a 39.97% +overall accuracy, illustrating current AI limitations in complex reasoning and +multimodal integration. Through the OlympicArena, we aim to advance AI towards +superintelligence, equipping it to address more complex challenges in science +and beyond. We also provide a comprehensive set of resources to support AI +research, including a benchmark dataset, an open-source annotation platform, a +detailed evaluation tool, and a leaderboard with automatic submission features. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ 360$^\circ$REA: Towards A Reusable Experience Accumulation with + 360° Assessment for Multi-Agent System + + +
+ Large language model agents have demonstrated remarkable advancements across +various complex tasks. Recent works focus on optimizing the agent team or +employing self-reflection to iteratively solve complex tasks. Since these +agents are all based on the same LLM, only conducting self-evaluation or +removing underperforming agents does not substantively enhance the capability +of the agents. We argue that a comprehensive evaluation and accumulating +experience from evaluation feedback is an effective approach to improving +system performance. In this paper, we propose Reusable Experience Accumulation +with 360$^\circ$ Assessment (360$^\circ$REA), a hierarchical multi-agent +framework inspired by corporate organizational practices. The framework employs +a novel 360$^\circ$ performance assessment method for multi-perspective +performance evaluation with fine-grained assessment. To enhance the capability +of agents in addressing complex tasks, we introduce dual-level experience pool +for agents to accumulate experience through fine-grained assessment. Extensive +experiments on complex task datasets demonstrate the effectiveness of +360$^\circ$REA. + +
+
+
+
+
+ + ♻ ☆ Structured Preference Optimization for Vision-Language Long-Horizon Task + Planning + + +
+ Existing methods for vision-language task planning excel in short-horizon +tasks but often fall short in complex, long-horizon planning within dynamic +environments. These challenges primarily arise from the difficulty of +effectively training models to produce high-quality reasoning processes for +long-horizon tasks. To address this, we propose Structured Preference +Optimization (SPO), which aims to enhance reasoning and action selection in +long-horizon task planning through structured preference evaluation and +optimized training strategies. Specifically, SPO introduces: 1) +Preference-Based Scoring and Optimization, which systematically evaluates +reasoning chains based on task relevance, visual grounding, and historical +consistency; and 2) Curriculum-Guided Training, where the model progressively +adapts from simple to complex tasks, improving its generalization ability in +long-horizon scenarios and enhancing reasoning robustness. To advance research +in vision-language long-horizon task planning, we introduce ExtendaBench, a +comprehensive benchmark covering 1,509 tasks across VirtualHome and Habitat +2.0, categorized into ultra-short, short, medium, and long tasks. Experimental +results demonstrate that SPO significantly improves reasoning quality and final +decision accuracy, outperforming prior methods on long-horizon tasks and +underscoring the effectiveness of preference-driven optimization in +vision-language task planning. Specifically, SPO achieves a +5.98% GCR and ++4.68% SR improvement in VirtualHome and a +3.30% GCR and +2.11% SR improvement +in Habitat over the best-performing baselines. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Detecting new obfuscated malware variants: A lightweight and + interpretable machine learning approach + + +
+ Machine learning has been successfully applied in developing malware +detection systems, with a primary focus on accuracy, and increasing attention +to reducing computational overhead and improving model interpretability. +However, an important question remains underexplored: How well can machine +learning-based models detect entirely new forms of malware not present in the +training data? In this study, we present a machine learning-based system for +detecting obfuscated malware that is not only highly accurate, lightweight and +interpretable, but also capable of successfully adapting to new types of +malware attacks. Our system is capable of detecting 15 malware subtypes despite +being exclusively trained on one malware subtype, namely the Transponder from +the Spyware family. This system was built after training 15 distinct random +forest-based models, each on a different malware subtype from the +CIC-MalMem-2022 dataset. These models were evaluated against the entire range +of malware subtypes, including all unseen malware subtypes. To maintain the +system's streamlined nature, training was confined to the top five most +important features, which also enhanced interpretability. The +Transponder-focused model exhibited high accuracy, exceeding 99.8%, with an +average processing speed of 5.7 microseconds per file. We also illustrate how +the Shapley additive explanations technique can facilitate the interpretation +of the model predictions. Our research contributes to advancing malware +detection methodologies, pioneering the feasibility of detecting obfuscated +malware by exclusively training a model on a single or a few carefully selected +malware subtypes and applying it to detect unseen subtypes. + +
+
+ comment: 30 pages (excluding Appendix), 5 figures and 5 tables. Now published + in Intelligent Systems with Applications + (https://doi.org/10.1016/j.iswa.2024.200472) +
+
+
+
+
+ + ♻ ☆ Stealthy Jailbreak Attacks on Large Language Models via Benign Data + Mirroring NAACL 2025 + + +
+ Large language model (LLM) safety is a critical issue, with numerous studies +employing red team testing to enhance model security. Among these, jailbreak +methods explore potential vulnerabilities by crafting malicious prompts that +induce model outputs contrary to safety alignments. Existing black-box +jailbreak methods often rely on model feedback, repeatedly submitting queries +with detectable malicious instructions during the attack search process. +Although these approaches are effective, the attacks may be intercepted by +content moderators during the search process. We propose an improved transfer +attack method that guides malicious prompt construction by locally training a +mirror model of the target black-box model through benign data distillation. +This method offers enhanced stealth, as it does not involve submitting +identifiable malicious instructions to the target model during the search +phase. Our approach achieved a maximum attack success rate of 92%, or a +balanced value of 80% with an average of 1.5 detectable jailbreak queries per +sample against GPT-3.5 Turbo on a subset of AdvBench. These results underscore +the need for more robust defense mechanisms. + +
+
+ comment: Accepted by NAACL 2025 +
+
+
+
+
+ + ♻ ☆ Nature Language Model: Deciphering the Language of Nature for Scientific + Discovery + + +
+ Foundation models have revolutionized natural language processing and +artificial intelligence, significantly enhancing how machines comprehend and +generate human languages. Inspired by the success of these foundation models, +researchers have developed foundation models for individual scientific domains, +including small molecules, materials, proteins, DNA, RNA and even cells. +However, these models are typically trained in isolation, lacking the ability +to integrate across different scientific domains. Recognizing that entities +within these domains can all be represented as sequences, which together form +the "language of nature", we introduce Nature Language Model (NatureLM), a +sequence-based science foundation model designed for scientific discovery. +Pre-trained with data from multiple scientific domains, NatureLM offers a +unified, versatile model that enables various applications including: (i) +generating and optimizing small molecules, proteins, RNA, and materials using +text instructions; (ii) cross-domain generation/design, such as +protein-to-molecule and protein-to-RNA generation; and (iii) top performance +across different domains, matching or surpassing state-of-the-art specialist +models. NatureLM offers a promising generalist approach for various scientific +tasks, including drug discovery (hit generation/optimization, ADMET +optimization, synthesis), novel material design, and the development of +therapeutic proteins or nucleotides. We have developed NatureLM models in +different sizes (1 billion, 8 billion, and 46.7 billion parameters) and +observed a clear improvement in performance as the model size increases. + +
+
+ comment: 93 pages +
+
+
+
+
+ + ♻ ☆ InfoDisent: Explainability of Image Classification Models by Information + Disentanglement + + +
+ In this work, we introduce InfoDisent, a hybrid approach to explainability +based on the information bottleneck principle. InfoDisent enables the +disentanglement of information in the final layer of any pretrained model into +atomic concepts, which can be interpreted as prototypical parts. This approach +merges the flexibility of post-hoc methods with the concept-level modeling +capabilities of self-explainable neural networks, such as ProtoPNets. We +demonstrate the effectiveness of InfoDisent through computational experiments +and user studies across various datasets using modern backbones such as ViTs +and convolutional networks. Notably, InfoDisent generalizes the prototypical +parts approach to novel domains (ImageNet). + +
+
+
+
+
+ + ♻ ☆ HelpSteer2-Preference: Complementing Ratings with Preferences ICLR 2025 + + +
+ Reward models are critical for aligning models to follow instructions, and +are typically trained following one of two popular paradigms: Bradley-Terry +style or Regression style. However, there is a lack of evidence that either +approach is better than the other, when adequately matched for data. This is +primarily because these approaches require data collected in different (but +incompatible) formats, meaning that adequately matched data is not available in +existing public datasets. To tackle this problem, we release preference +annotations (designed for Bradley-Terry training) to complement existing +ratings (designed for Regression style training) in the HelpSteer2 dataset. To +improve data interpretability, preference annotations are accompanied with +human-written justifications. Using this data, we conduct the first +head-to-head comparison of Bradley-Terry and Regression models when adequately +matched for data. Based on insights derived from such a comparison, we propose +a novel approach to combine Bradley-Terry and Regression reward modeling. A +Llama-3.1-70B-Instruct model tuned with this approach scores 94.1 on +RewardBench, emerging top of more than 140 reward models as of 1 Oct 2024. This +reward model can then be used with REINFORCE algorithm (RLHF) to align an +Instruct model to reach 85.0 on Arena Hard, which is No. 1 as of 1 Oct 2024. We +open-source this dataset (CC-BY-4.0 license) at +https://huggingface.co/datasets/nvidia/HelpSteer2#preferences-new -- 1-oct-2024 +and openly release the trained Reward and Instruct models at +https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Reward and +https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct + +
+
+ comment: Accepted to ICLR 2025; 28 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A Backbone for Long-Horizon Robot Task Understanding + + +
+ End-to-end robot learning, particularly for long-horizon tasks, often results +in unpredictable outcomes and poor generalization. To address these challenges, +we propose a novel Therblig-Based Backbone Framework (TBBF) as a fundamental +structure to enhance interpretability, data efficiency, and generalization in +robotic systems. TBBF utilizes expert demonstrations to enable therblig-level +task decomposition, facilitate efficient action-object mapping, and generate +adaptive trajectories for new scenarios. The approach consists of two stages: +offline training and online testing. During the offline training stage, we +developed the Meta-RGate SynerFusion (MGSF) network for accurate therblig +segmentation across various tasks. In the online testing stage, after a +one-shot demonstration of a new task is collected, our MGSF network extracts +high-level knowledge, which is then encoded into the image using Action +Registration (ActionREG). Additionally, Large Language Model (LLM)-Alignment +Policy for Visual Correction (LAP-VC) is employed to ensure precise action +registration, facilitating trajectory transfer in novel robot scenarios. +Experimental results validate these methods, achieving 94.37% recall in +therblig segmentation and success rates of 94.4% and 80% in real-world online +robot testing for simple and complex scenarios, respectively. Supplementary +material is available at: +https://sites.google.com/view/therbligsbasedbackbone/home + +
+
+ comment: 8 pages, 8 figures. This work has been published by IEEE Robotics and + Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ Evaluating Search Engines and Large Language Models for Answering Health + Questions + + +
+ Search engines (SEs) have traditionally been primary tools for information +seeking, but the new Large Language Models (LLMs) are emerging as powerful +alternatives, particularly for question-answering tasks. This study compares +the performance of four popular SEs, seven LLMs, and retrieval-augmented (RAG) +variants in answering 150 health-related questions from the TREC Health +Misinformation (HM) Track. Results reveal SEs correctly answer between 50 and +70% of questions, often hindered by many retrieval results not responding to +the health question. LLMs deliver higher accuracy, correctly answering about +80% of questions, though their performance is sensitive to input prompts. RAG +methods significantly enhance smaller LLMs' effectiveness, improving accuracy +by up to 30% by integrating retrieval evidence. + +
+
+
+
+
+ + ♻ ☆ Extracting Formulae in Many-Valued Logic from Deep Neural Networks + + +
+ We propose a new perspective on deep ReLU networks, namely as circuit +counterparts of Lukasiewicz infinite-valued logic -- a many-valued (MV) +generalization of Boolean logic. An algorithm for extracting formulae in MV +logic from deep ReLU networks is presented. As the algorithm applies to +networks with general, in particular also real-valued, weights, it can be used +to extract logical formulae from deep ReLU networks trained on data. + +
+
+ comment: Significant extension of the previous version +
+
+
+
+
+ + ♻ ☆ VISION-XL: High Definition Video Inverse Problem Solver using Latent + Image Diffusion Models + + +
+ In this paper, we propose a novel framework for solving high-definition video +inverse problems using latent image diffusion models. Building on recent +advancements in spatio-temporal optimization for video inverse problems using +image diffusion models, our approach leverages latent-space diffusion models to +achieve enhanced video quality and resolution. To address the high +computational demands of processing high-resolution frames, we introduce a +pseudo-batch consistent sampling strategy, allowing efficient operation on a +single GPU. Additionally, to improve temporal consistency, we present +pseudo-batch inversion, an initialization technique that incorporates +informative latents from the measurement. By integrating with SDXL, our +framework achieves state-of-the-art video reconstruction across a wide range of +spatio-temporal inverse problems, including complex combinations of frame +averaging and various spatial degradations, such as deblurring, +super-resolution, and inpainting. Unlike previous methods, our approach +supports multiple aspect ratios (landscape, vertical, and square) and delivers +HD-resolution reconstructions (exceeding 1280x720) in under 6 seconds per frame +on a single NVIDIA 4090 GPU. + +
+
+ comment: Project page: https://vision-xl.github.io/ +
+
+
+
+
+ + ♻ ☆ No More Sliding Window: Efficient 3D Medical Image Segmentation with + Differentiable Top-k Patch Sampling + + +
+ 3D models surpass 2D models in CT/MRI segmentation by effectively capturing +inter-slice relationships. However, the added depth dimension substantially +increases memory consumption. While patch-based training alleviates memory +constraints, it significantly slows down the inference speed due to the sliding +window (SW) approach. We propose No-More-Sliding-Window (NMSW), a novel +end-to-end trainable framework that enhances the efficiency of generic 3D +segmentation backbone during an inference step by eliminating the need for SW. +NMSW employs a differentiable Top-k module to selectively sample only the most +relevant patches, thereby minimizing redundant computations. When patch-level +predictions are insufficient, the framework intelligently leverages coarse +global predictions to refine results. Evaluated across 3 tasks using 3 +segmentation backbones, NMSW achieves competitive accuracy compared to SW +inference while significantly reducing computational complexity by 91% (88.0 to +8.00 TMACs). Moreover, it delivers a 9.1x faster inference on the H100 GPU +(99.0 to 8.3 sec) and a 11.1x faster inference on the Xeon Gold CPU (2110 to +189 sec). NMSW is model-agnostic, further boosting efficiency when integrated +with any existing efficient segmentation backbones. + +
+
+
+
+
+ + ♻ ☆ Robust Deterministic Policy Gradient for Disturbance Attenuation and Its + Application to Quadrotor Control + + +
+ Practical control systems pose significant challenges in identifying optimal +control policies due to uncertainties in the system model and external +disturbances. While $H_\infty$ control techniques are commonly used to design +robust controllers that mitigate the effects of disturbances, these methods +often require complex and computationally intensive calculations. To address +this issue, this paper proposes a reinforcement learning algorithm called +Robust Deterministic Policy Gradient (RDPG), which formulates the $H_\infty$ +control problem as a two-player zero-sum dynamic game. In this formulation, one +player (the user) aims to minimize the cost, while the other player (the +adversary) seeks to maximize it. We then employ deterministic policy gradient +(DPG) and its deep reinforcement learning counterpart to train a robust control +policy with effective disturbance attenuation. In particular, for practical +implementation, we introduce an algorithm called robust deep deterministic +policy gradient (RDDPG), which employs a deep neural network architecture and +integrates techniques from the twin-delayed deep deterministic policy gradient +(TD3) to enhance stability and learning efficiency. To evaluate the proposed +algorithm, we implement it on an unmanned aerial vehicle (UAV) tasked with +following a predefined path in a disturbance-prone environment. The +experimental results demonstrate that the proposed method outperforms other +control approaches in terms of robustness against disturbances, enabling +precise real-time tracking of moving targets even under severe disturbance +conditions. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ When Claims Evolve: Evaluating and Enhancing the Robustness of Embedding + Models Against Misinformation Edits + + +
+ Online misinformation remains a critical challenge, and fact-checkers +increasingly rely on embedding-based methods to retrieve relevant fact-checks. +Yet, when debunked claims reappear in edited forms, the performance of these +methods is unclear. In this work, we introduce a taxonomy of six common +real-world misinformation edits and propose a perturbation framework that +generates valid, natural claim variations. Our multi-stage retrieval evaluation +reveals that standard embedding models struggle with user-introduced edits, +while LLM-distilled embeddings offer improved robustness at a higher +computational cost. Although a strong reranker helps mitigate some issues, it +cannot fully compensate for first-stage retrieval gaps. Addressing these +retrieval gaps, our train- and inference-time mitigation approaches enhance +in-domain robustness by up to 17 percentage points and boost out-of-domain +generalization by 10 percentage points over baseline models. Overall, our +findings provide practical improvements to claim-matching systems, enabling +more reliable fact-checking of evolving misinformation. + +
+
+
+
+
+ + ♻ ☆ Semi-Parametric Retrieval via Binary Bag-of-Tokens Index + + +
+ Information retrieval has transitioned from standalone systems into essential +components across broader applications, with indexing efficiency, +cost-effectiveness, and freshness becoming increasingly critical yet often +overlooked. In this paper, we introduce SemI-parametric Disentangled Retrieval +(SiDR), a bi-encoder retrieval framework that decouples retrieval index from +neural parameters to enable efficient, low-cost, and parameter-agnostic +indexing for emerging use cases. Specifically, in addition to using embeddings +as indexes like existing neural retrieval methods, SiDR supports a +non-parametric tokenization index for search, achieving BM25-like indexing +complexity with significantly better effectiveness. Our comprehensive +evaluation across 16 retrieval benchmarks demonstrates that SiDR outperforms +both neural and term-based retrieval baselines under the same indexing +workload: (i) When using an embedding-based index, SiDR exceeds the performance +of conventional neural retrievers while maintaining similar training +complexity; (ii) When using a tokenization-based index, SiDR drastically +reduces indexing cost and time, matching the complexity of traditional +term-based retrieval, while consistently outperforming BM25 on all in-domain +datasets; (iii) Additionally, we introduce a late parametric mechanism that +matches BM25 index preparation time while outperforming other neural retrieval +baselines in effectiveness. + +
+
+
+
+
+ + ♻ ☆ Prompt-Matcher: Leveraging Large Models to Reduce Uncertainty in Schema + Matching Results + + +
+ Schema matching is the process of identifying correspondences between the +elements of two given schemata, essential for database management systems, data +integration, and data warehousing. For datasets across different scenarios, the +optimal schema matching algorithm is different. For a single algorithm, +hyperparameter tuning also causes multiple results. All results assigned equal +probabilities are stored in probabilistic databases to facilitate uncertainty +management. The substantial degree of uncertainty diminishes the efficiency and +reliability of data processing, thereby precluding the provision of more +accurate information for decision-makers. To address this problem, we introduce +a new approach based on fine-grained correspondence verification with specific +prompt of Large Language Model. + Our approach is an iterative loop that consists of three main components: (1) +the correspondence selection algorithm, (2) correspondence verification, and +(3) the update of probability distribution. The core idea is that +correspondences intersect across multiple results, thereby linking the +verification of correspondences to the reduction of uncertainty in candidate +results. + The task of selecting an optimal correspondence set to maximize the +anticipated uncertainty reduction within a fixed budgetary framework is +established as an NP-hard problem. We propose a novel $(1-1/e)$-approximation +algorithm that significantly outperforms the brute-force algorithm in terms of +computational efficiency. To enhance correspondence verification, we have +developed two prompt templates that enable GPT-4 to achieve state-of-the-art +performance across two established benchmark datasets. Our comprehensive +experimental evaluation demonstrates the superior effectiveness and robustness +of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ StoryTeller: Improving Long Video Description through Global + Audio-Visual Character Identification + + +
+ Existing large vision-language models (LVLMs) are largely limited to +processing short, seconds-long videos and struggle with generating coherent +descriptions for extended video spanning minutes or more. Long video +description introduces new challenges, such as consistent character +identification and plot-level descriptions incorporating both visual and audio +information. To address these, we figure out audio-visual character +identification, matching character names to each dialogue, as a key factor. We +propose StoryTeller, a system for generating dense descriptions of long videos, +incorporating both low-level visual concepts and high-level plot information. +StoryTeller uses a multimodal large language model that integrates visual, +audio, and text modalities to perform audio-visual character identification on +minute-long video clips. The results are then fed into a LVLM to enhance +consistency of video description. We validate our approach on movie description +tasks and introduce MovieStory101, a dataset with dense descriptions for +three-minute movie clips. To evaluate long video descriptions, we create +StoryQA, a large set of multiple-choice questions for MovieStory101 test set. +We assess descriptions by inputting them into GPT-4 to answer these questions, +using accuracy as an automatic evaluation metric. Experiments show that +StoryTeller outperforms all open and closed-source baselines on StoryQA, +achieving 9.5% higher accuracy than the strongest baseline, Gemini-1.5-pro, and +demonstrating a +15.56% advantage in human side-by-side evaluations. +Additionally, incorporating audio-visual character identification from +StoryTeller improves the performance of all video description models, with +Gemini-1.5-pro and GPT-4o showing relative improvement of 5.5% and 13.0%, +respectively, in accuracy on StoryQA. + +
+
+
+
+
+ + ♻ ☆ Explaining Caption-Image Interactions in CLIP models with Second-Order + Attributions + + +
+ Dual encoder architectures like CLIP models map two types of inputs into a +shared embedding space and predict similarities between them. Despite their +success, it is, however, not understood how these models compare their two +inputs. Common first-order feature-attribution methods can only provide limited +insights into dual-encoders since their predictions depend on +feature-interactions rather than on individual features. In this paper, we +first derive a second-order method enabling the attribution of predictions by +any differentiable dual encoder onto feature-interactions between its inputs. +Second, we apply our method to CLIP models and show that they learn +fine-grained correspondences between parts of captions and regions in images. +They match objects across input modes and also account for mismatches. This +visual-linguistic grounding ability, however, varies heavily between object +classes and exhibits pronounced out-of-domain effects. We can identify +individual errors as well as systematic failure categories including object +coverage, unusual scenes and correlated contexts. + +
+
+
+
+
+ + ♻ ☆ Union of Experts: Adapting Hierarchical Routing to Equivalently + Decomposed Transformer + + +
+ We propose Union-of-Experts (UoE), which decomposes transformer into an +equivalent group of experts, and then implement selective routing on input data +and experts. Our approach advances MoE design with four key innovations: (1) We +conducted equivalent expert decomposition on both MLP blocks and attention blocks +based on matrix partition in tensor parallelism. (2) We developed two routing +paradigms: patch-wise data selection and expert selection, to apply routing +across different levels. (3) We design the architecture of UoE model, including +Selective Multi-Head Attention (SMHA) and Union-of-MLP-Experts (UoME). (4) We +develop parallel implementation of UoE's routing and computation operation, and +optimize efficiency based on the hardware processing analysis. The experiments +demonstrate that the UoE model surpasses Full Attention, state-of-the-art MoEs and +efficient transformers (including the model architecture of recently proposed +DeepSeek-V3) in several tasks across image and natural language domains. In +language modeling tasks, we achieve an average reduction of 2.38 in perplexity +compared to the best-performing MoE method with an average of 76% FLOPs. In Long +Range Arena benchmark, we recorded an average score that is at least 0.68% +higher than all comparison models including Full Attention, MoEs, and +transformer variants, with only 50% FLOPs of the best MoE method. In image +classification, our model yielded an average accuracy improvement of 1.75% over +the best model while maintaining comparable FLOPs. The source codes are +available at https://github.com/YujiaoYang-work/UoE. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Pap2Pat: Benchmarking Outline-Guided Long-Text Patent Generation with + Patent-Paper Pairs + + +
+ Dealing with long and highly complex technical text is a challenge for Large +Language Models (LLMs), which still have to unfold their potential in +supporting expensive and time-intensive processes like patent drafting. Within +patents, the description constitutes more than 90% of the document on average. +Yet, its automatic generation remains understudied. When drafting patent +applications, patent attorneys typically receive invention reports (IRs), which +are usually confidential, hindering research on LLM-supported patent drafting. +Often, prepublication research papers serve as IRs. We leverage this duality to +build PAP2PAT, an open and realistic benchmark for patent drafting consisting +of 1.8k patent-paper pairs describing the same inventions. To address the +complex long-document patent generation task, we propose chunk-based +outline-guided generation using the research paper as invention specification. +Our extensive evaluation using PAP2PAT and a human case study show that LLMs +can effectively leverage information from the paper, but still struggle to +provide the necessary level of detail. Fine-tuning leads to more patent-style +language, but also to more hallucination. We release our data and code at +https://github.com/boschresearch/Pap2Pat. + +
+
+
+
+
+ + ♻ ☆ Measuring Human and AI Values Based on Generative Psychometrics with + Large Language Models AAAI 2025 + + +
+ Human values and their measurement are long-standing interdisciplinary +inquiry. Recent advances in AI have sparked renewed interest in this area, with +large language models (LLMs) emerging as both tools and subjects of value +measurement. This work introduces Generative Psychometrics for Values (GPV), an +LLM-based, data-driven value measurement paradigm, theoretically grounded in +text-revealed selective perceptions. The core idea is to dynamically parse +unstructured texts into perceptions akin to static stimuli in traditional +psychometrics, measure the value orientations they reveal, and aggregate the +results. Applying GPV to human-authored blogs, we demonstrate its stability, +validity, and superiority over prior psychological tools. Then, extending GPV +to LLM value measurement, we advance the current art with 1) a psychometric +methodology that measures LLM values based on their scalable and free-form +outputs, enabling context-specific measurement; 2) a comparative analysis of +measurement paradigms, indicating response biases of prior methods; and 3) an +attempt to bridge LLM values and their safety, revealing the predictive power +of different value systems and the impacts of various values on LLM safety. +Through interdisciplinary efforts, we aim to leverage AI for next-generation +psychometrics and psychometrics for value-aligned AI. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ BackdoorMBTI: A Backdoor Learning Multimodal Benchmark Tool Kit for + Backdoor Defense Evaluation + + +
+ Over the past few years, the emergence of backdoor attacks has presented +significant challenges to deep learning systems, allowing attackers to insert +backdoors into neural networks. When data with a trigger is processed by a +backdoor model, it can lead to mispredictions targeted by attackers, whereas +normal data yields regular results. The scope of backdoor attacks is expanding +beyond computer vision and encroaching into areas such as natural language +processing and speech recognition. Nevertheless, existing backdoor defense +methods are typically tailored to specific data modalities, restricting their +application in multimodal contexts. While multimodal learning proves highly +applicable in facial recognition, sentiment analysis, action recognition, +visual question answering, the security of these models remains a crucial +concern. Specifically, there are no existing backdoor benchmarks targeting +multimodal applications or related tasks. + In order to facilitate the research in multimodal backdoor, we introduce +BackdoorMBTI, the first backdoor learning toolkit and benchmark designed for +multimodal evaluation across three representative modalities from eleven +commonly used datasets. BackdoorMBTI provides a systematic backdoor learning +pipeline, encompassing data processing, data poisoning, backdoor training, and +evaluation. The generated poison datasets and backdoor models enable detailed +evaluation of backdoor defenses. Given the diversity of modalities, +BackdoorMBTI facilitates systematic evaluation across different data types. +Furthermore, BackdoorMBTI offers a standardized approach to handling practical +factors in backdoor learning, such as issues related to data quality and +erroneous labels. We anticipate that BackdoorMBTI will expedite future research +in backdoor defense methods within a multimodal context. Code is available at +https://github.com/SJTUHaiyangYu/BackdoorMBTI. + +
+
+
+
+
+ + ♻ ☆ Revisiting Multi-Permutation Equivariance through the Lens of + Irreducible Representations + + +
+ This paper explores the characterization of equivariant linear layers for +representations of permutations and related groups. Unlike traditional +approaches, which address these problems using parameter-sharing, we consider +an alternative methodology based on irreducible representations and Schur's +lemma. Using this methodology, we obtain an alternative derivation for existing +models like DeepSets, 2-IGN graph equivariant networks, and Deep Weight Space +(DWS) networks. The derivation for DWS networks is significantly simpler than +that of previous results. + Next, we extend our approach to unaligned symmetric sets, where equivariance +to the wreath product of groups is required. Previous works have addressed this +problem in a rather restrictive setting, in which almost all wreath equivariant +layers are Siamese. In contrast, we give a full characterization of layers in +this case and show that there is a vast number of additional non-Siamese layers +in some settings. We also show empirically that these additional non-Siamese +layers can improve performance in tasks like graph anomaly detection, weight +space alignment, and learning Wasserstein distances. Our code is available at +\href{https://github.com/yonatansverdlov/Irreducible-Representations-of-Deep-Weight-Spaces}{GitHub}. + +
+
+
+
+
+ + ♻ ☆ Autoformalizing Natural Language to First-Order Logic: A Case Study in + Logical Fallacy Detection + + +
+ Translating natural language into formal language such as First-Order Logic +(FOL) is a foundational challenge in NLP with wide-ranging applications in +automated reasoning, misinformation tracking, and knowledge validation. In this +paper, we introduce Natural Language to First-Order Logic (NL2FOL), a framework +to autoformalize natural language to FOL step by step using Large Language +Models (LLMs). Our approach addresses key challenges in this translation +process, including the integration of implicit background knowledge. By +leveraging structured representations generated by NL2FOL, we use +Satisfiability Modulo Theory (SMT) solvers to reason about the logical validity +of natural language statements. We present logical fallacy detection as a case +study to evaluate the efficacy of NL2FOL. Being neurosymbolic, our approach +also provides interpretable insights into the reasoning process and +demonstrates robustness without requiring model fine-tuning or labeled training +data. Our framework achieves strong performance on multiple datasets. On the +LOGIC dataset, NL2FOL achieves an F1-score of 78%, while generalizing +effectively to the LOGICCLIMATE dataset with an F1-score of 80%. + +
+
+
+
+
+ + ♻ ☆ An LLM-based Agent for Reliable Docker Environment Configuration + + +
+ Environment configuration is a critical yet time-consuming step in software +development, especially when dealing with unfamiliar code repositories. While +Large Language Models (LLMs) demonstrate the potential to accomplish software +engineering tasks, existing methods for environment configuration often rely on +manual efforts or fragile scripts, leading to inefficiencies and unreliable +outcomes. We introduce Repo2Run, the first LLM-based agent designed to fully +automate environment configuration and generate executable Dockerfiles for +arbitrary Python repositories. We address two major challenges: (1) enabling +the LLM agent to configure environments within isolated Docker containers, and +(2) ensuring the successful configuration process is recorded and accurately +transferred to a Dockerfile without error. To achieve this, we propose atomic +configuration synthesis, featuring a dual-environment architecture (internal +and external environment) with a rollback mechanism to prevent environment +"pollution" from failed commands, guaranteeing atomic execution (execute fully +or not at all) and a Dockerfile generator to transfer successful configuration +steps into runnable Dockerfiles. We evaluate Repo2Run on our proposed benchmark +of 420 recent Python repositories with unit tests, where it achieves an 86.0% +success rate, outperforming the best baseline by 63.9%. Repo2Run is available +at https://github.com/bytedance/Repo2Run. + +
+
+
+
+
+ + ♻ ☆ R2-KG: General-Purpose Dual-Agent Framework for Reliable Reasoning on + Knowledge Graphs + + +
+ Recent studies have combined Large Language Models (LLMs) with Knowledge +Graphs (KGs) to enhance reasoning, improving inference accuracy without +additional training while mitigating hallucination. However, existing +frameworks are often rigid, struggling to adapt to KG or task changes. They +also rely heavily on powerful LLMs for reliable (i.e., trustworthy) reasoning. +To address this, we introduce R2-KG, a plug-and-play, dual-agent framework that +separates reasoning into two roles: an Operator (a low-capacity LLM) that +gathers evidence and a Supervisor (a high-capacity LLM) that makes final +judgments. This design is cost-efficient for LLM inference while still +maintaining strong reasoning accuracy. Additionally, R2-KG employs an +Abstention mechanism, generating answers only when sufficient evidence is +collected from KG, which significantly enhances reliability. Experiments across +multiple KG-based reasoning tasks show that R2-KG consistently outperforms +baselines in both accuracy and reliability, regardless of the inherent +capability of LLMs used as the Operator. Further experiments reveal that the +single-agent version of R2-KG, equipped with a strict self-consistency +strategy, achieves significantly higher-than-baseline reliability while +reducing inference cost. However, it also leads to a higher abstention rate in +complex KGs. Our findings establish R2-KG as a flexible and cost-effective +solution for KG-based reasoning. It reduces reliance on high-capacity LLMs +while ensuring trustworthy inference. + +
+
+
+
+
+ + ♻ ☆ Markov Chain of Thought for Efficient Mathematical Reasoning NAACL 2025 + + +
+ Chain of Thought (CoT) of multi-step benefits from the logical structure of +the reasoning steps and task-specific actions, significantly enhancing the +mathematical reasoning capabilities of large language models. As the prevalence +of long CoT, the number of reasoning steps exceeds manageable token limits and +leads to higher computational demands. Inspired by the fundamental logic of +human cognition, "derive, then reduce", we conceptualize the standard +multi-step CoT as a novel Markov Chain of Thought (MCoT). In this study, we +consider the mathematical reasoning task, defining each reasoning step as text +accompanied by a Python code snippet. To facilitate a longer reasoning path, +self-correction is enabled through interactions with the code interpreter. Our +MCoT aims to compress previous reasoning steps into a simplified question, +enabling efficient next-step inference without relying on a lengthy KV cache. +In our experiments, we curate the $\texttt{MCoTInstruct}$ dataset, and the +empirical results indicate that MCoT not only significantly enhances efficiency +but also maintains comparable accuracy. While much remains to be explored, this +work paves the way for exploring the long CoT reasoning abilities of LLMs. The +code is available at https://github.com/james-yw/Markov-Chain-of-Thought + +
+
+ comment: Camera ready version for NAACL 2025 Main +
+
+
+
+
+ + ♻ ☆ Investigating Non-Transitivity in LLM-as-a-Judge + + +
+ Automatic evaluation methods based on large language models (LLMs) are +emerging as the standard tool for assessing the instruction-following abilities +of LLM-based agents. The most common method in this paradigm, pairwise +comparisons with a baseline model, critically depends on the assumption of +transitive preferences. However, the validity of this assumption remains +largely unexplored. In this study, we investigate the presence of +non-transitivity within the AlpacaEval framework and analyze its effects on +model rankings. We find that LLM judges exhibit non-transitive preferences, +leading to rankings that are sensitive to the choice of the baseline model. To +mitigate this issue, we show that round-robin tournaments combined with +Bradley-Terry models of preference can produce more reliable rankings. Notably, +our method increases both the Spearman correlation and the Kendall correlation +with Chatbot Arena (95.0% -> 96.4% and 82.1% -> 86.3% respectively). To address +the computational cost of round-robin tournaments, we propose Swiss-Wise +Iterative Matchmaking (Swim) tournaments, using a dynamic matching strategy to +capture the benefits of round-robin tournaments while maintaining computational +efficiency. + +
+
+ comment: 8 pages, 6 figures, 2 tables (30 pages, 11 figures, 8 tables + including references and appendices) +
+
+
+
+
+ + ♻ ☆ Towards Edge General Intelligence via Large Language Models: + Opportunities and Challenges + + +
+ Edge Intelligence (EI) has been instrumental in delivering real-time, +localized services by leveraging the computational capabilities of edge +networks. The integration of Large Language Models (LLMs) empowers EI to evolve +into the next stage: Edge General Intelligence (EGI), enabling more adaptive +and versatile applications that require advanced understanding and reasoning +capabilities. However, systematic exploration in this area remains +insufficient. This survey delineates the distinctions between EGI and +traditional EI, categorizing LLM-empowered EGI into three conceptual systems: +centralized, hybrid, and decentralized. For each system, we detail the +framework designs and review existing implementations. Furthermore, we evaluate +the performance and throughput of various Small Language Models (SLMs) that are +more suitable for development on edge devices. This survey provides researchers +with a comprehensive vision of EGI, offering insights into its vast potential +and establishing a foundation for future advancements in this rapidly evolving +field. + +
+
+
+
+
+ + ♻ ☆ Training and Evaluating Language Models with Template-based Data + Generation + + +
+ The rapid advancement of large language models (LLMs) such as GPT-3, PaLM, +and Llama has significantly transformed natural language processing, showcasing +remarkable capabilities in understanding and generating language. However, +these models often struggle with tasks requiring complex reasoning, +particularly in mathematical problem-solving, due in part to the scarcity of +large-scale, high-quality, domain-specific datasets necessary for training +sophisticated reasoning abilities. To address this limitation, we introduce +Template-based Data Generation (TDG), a novel approach that leverages LLMs +(GPT-4) to automatically generate parameterized meta-templates, which are then +used to synthesize a vast array of high-quality problems and solutions. +Leveraging TDG, we create TemplateMath Part I: TemplateGSM, a dataset +comprising over 7 million synthetically generated grade school math +problems--each accompanied by code-based and natural language solutions--with +the potential to generate an effectively unlimited number more. This dataset +alleviates the scarcity of large-scale mathematical datasets and serves as a +valuable resource for pre-training, fine-tuning, and evaluating LLMs in +mathematical reasoning. Our method not only enables the generation of virtually +infinite data but also elevates data augmentation to a new level by using GPT-4 +for meta-template generation, ensuring diverse and high-quality problem +structures. The TemplateMath Part I: TemplateGSM dataset is publicly available +at https://huggingface.co/datasets/math-ai/TemplateGSM. The code is available +at https://github.com/iiis-ai/TemplateMath. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Legal Fact Prediction: The Missing Piece in Legal Judgment Prediction + + +
+ Legal judgment prediction (LJP), which enables litigants and their lawyers to +forecast judgment outcomes and refine litigation strategies, has emerged as a +crucial legal NLP task. Existing studies typically utilize legal facts, i.e., +facts that have been established by evidence and determined by the judge, to +predict the judgment. However, legal facts are often difficult to obtain in the +early stages of litigation, significantly limiting the practical applicability +of fact-based LJP. To address this limitation, we propose a novel legal NLP +task: \textit{legal fact prediction} (LFP), which takes the evidence submitted +by litigants for trial as input to predict legal facts, thereby empowering +fact-based LJP technologies to perform prediction in the absence of +ground-truth legal facts. We also propose the first benchmark dataset, +LFPBench, for evaluating the LFP task. Our extensive experiments on LFPBench +demonstrate the effectiveness of LFP-empowered LJP and highlight promising +research directions for LFP. Our code and data are available at +https://github.com/HPRCEST/LFPBench. + +
+
+
+
+
+ + ♻ ☆ Prompting with Phonemes: Enhancing LLMs' Multilinguality for Non-Latin + Script Languages NAACL 2025 + + +
+ Although multilingual LLMs have achieved remarkable performance across +benchmarks, we find they continue to underperform on non-Latin script languages +across contemporary LLM families. This discrepancy arises from the fact that +LLMs are pretrained with orthographic scripts, which are dominated by Latin +characters that obscure their shared phonology with non-Latin scripts. We +propose leveraging phonemic transcriptions as complementary signals to induce +script-invariant representations. Our study demonstrates that integrating +phonemic signals improves performance across both non-Latin and Latin script +languages, with a particularly significant impact on closing the performance +gap between the two. Through detailed experiments, we show that phonemic and +orthographic scripts retrieve distinct examples for in-context learning (ICL). +This motivates our proposed Mixed-ICL retrieval strategy, where further +aggregation from both leads to our significant performance improvements for +both Latin script languages (up to 12.6%) and non-Latin script languages (up to +15.1%) compared to randomized ICL retrieval. + +
+
+ comment: Accepted for NAACL 2025 (Main Conference) +
+
+
+
+
+ + ♻ ☆ PQMass: Probabilistic Assessment of the Quality of Generative Models + using Probability Mass Estimation ICLR 2025 + + +
+ We propose a likelihood-free method for comparing two distributions given +samples from each, with the goal of assessing the quality of generative models. +The proposed approach, PQMass, provides a statistically rigorous method for +assessing the performance of a single generative model or the comparison of +multiple competing models. PQMass divides the sample space into non-overlapping +regions and applies chi-squared tests to the number of data samples that fall +within each region, giving a p-value that measures the probability that the bin +counts derived from two sets of samples are drawn from the same multinomial +distribution. PQMass does not depend on assumptions regarding the density of +the true distribution, nor does it rely on training or fitting any auxiliary +models. We evaluate PQMass on data of various modalities and dimensions, +demonstrating its effectiveness in assessing the quality, novelty, and +diversity of generated samples. We further show that PQMass scales well to +moderately high-dimensional data and thus obviates the need for feature +extraction in practical applications. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ SMAC-R1: The Emergence of Intelligence in Decision-Making Tasks + + +
+ StarCraft Multi-Agent Challenge (SMAC) has been one of the most commonly used +experimental environments in multi-agent reinforcement learning (MARL), where +the specific task is to control a set number of allied units to defeat enemy +forces. Traditional MARL algorithms often require interacting with the +environment for millions of steps to train a parametric model, of which the +resulting policies are typically non-interpretable with weak transferability. +In this paper, we introduce SMAC-R1 which is based on the Qwen2.5-7B-Base LLM +distilled from DeepSeek-Coder-v2.5-236B. Similar to online reinforcement +learning after behavior cloning in offline learning process, in our pipeline, +agents leverage the DeepSeek LLM to generate decision tree code by providing +task descriptions, and the agents are further self-reflected using feedback +from the rewards provided by the environment. Based on that, we augment the +generated scripts to fine-tune a small LLM, Qwen2.5-7B-Base, to distill the +decision-making ability via Supervised Fine-Tuning (SFT) and enhance the script +generation ability by the Group Relative Policy Optimization (GRPO) algorithm. +We conduct experiments in the original 23 SMAC tasks and 10 newly-designed +tasks to demonstrate that our method can produce high-quality, interpretable +decision trees with minimal environmental exploration. Moreover, these scripts +exhibit strong transferability, successfully applying to homogeneous SMAC +environments without modification. We believe this approach offers a new +direction for solving decision-making tasks and domain-specific LLM training +pipelines in the future. + +
+
+
+
+
+ + ♻ ☆ PQMass: Probabilistic Assessment of the Quality of Generative Models + using Probability Mass Estimation ICLR 2025 + + +
+ We propose a likelihood-free method for comparing two distributions given +samples from each, with the goal of assessing the quality of generative models. +The proposed approach, PQMass, provides a statistically rigorous method for +assessing the performance of a single generative model or the comparison of +multiple competing models. PQMass divides the sample space into non-overlapping +regions and applies chi-squared tests to the number of data samples that fall +within each region, giving a p-value that measures the probability that the bin +counts derived from two sets of samples are drawn from the same multinomial +distribution. PQMass does not depend on assumptions regarding the density of +the true distribution, nor does it rely on training or fitting any auxiliary +models. We evaluate PQMass on data of various modalities and dimensions, +demonstrating its effectiveness in assessing the quality, novelty, and +diversity of generated samples. We further show that PQMass scales well to +moderately high-dimensional data and thus obviates the need for feature +extraction in practical applications. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+
+
+
+ + Genomics 3 + +
+
+
+ + ☆ Large Language Models in Bioinformatics: A Survey + + +
+ Large Language Models (LLMs) are revolutionizing bioinformatics, enabling +advanced analysis of DNA, RNA, proteins, and single-cell data. This survey +provides a systematic review of recent advancements, focusing on genomic +sequence modeling, RNA structure prediction, protein function inference, and +single-cell transcriptomics. Meanwhile, we also discuss several key challenges, +including data scarcity, computational complexity, and cross-omics integration, +and explore future directions such as multimodal learning, hybrid AI models, +and clinical applications. By offering a comprehensive perspective, this paper +underscores the transformative potential of LLMs in driving innovations in +bioinformatics and precision medicine. + +
+
+
+
+
+ + ☆ Large Language Models for Zero-shot Inference of Causal Structures in + Biology ICLR 2025 + + +
+ Genes, proteins and other biological entities influence one another via +causal molecular networks. Causal relationships in such networks are mediated +by complex and diverse mechanisms, through latent variables, and are often +specific to cellular context. It remains challenging to characterise such +networks in practice. Here, we present a novel framework to evaluate large +language models (LLMs) for zero-shot inference of causal relationships in +biology. In particular, we systematically evaluate causal claims obtained from +an LLM using real-world interventional data. This is done over one hundred +variables and thousands of causal hypotheses. Furthermore, we consider several +prompting and retrieval-augmentation strategies, including large, and +potentially conflicting, collections of scientific articles. Our results show +that with tailored augmentation and prompting, even relatively small LLMs can +capture meaningful aspects of causal structure in biological systems. This +supports the notion that LLMs could act as orchestration tools in biological +discovery, by helping to distil current knowledge in ways amenable to +downstream analysis. Our approach to assessing LLMs with respect to +experimental data is relevant for a broad range of problems at the intersection +of causal learning, LLMs and scientific discovery. + +
+
+ comment: ICLR 2025 Workshop on Machine Learning for Genomics Explorations +
+
+
+
+
+ + ♻ ☆ GENERator: A Long-Context Generative Genomic Foundation Model + + +
+ Advancements in DNA sequencing technologies have significantly improved our +ability to decode genomic sequences. However, the prediction and interpretation +of these sequences remain challenging due to the intricate nature of genetic +material. Large language models (LLMs) have introduced new opportunities for +biological sequence analysis. Recent developments in genomic language models +have underscored the potential of LLMs in deciphering DNA sequences. +Nonetheless, existing models often face limitations in robustness and +application scope, primarily due to constraints in model structure and training +data scale. To address these limitations, we present GENERator, a generative +genomic foundation model featuring a context length of 98k base pairs (bp) and +1.2B parameters. Trained on an expansive dataset comprising 386B bp of +eukaryotic DNA, the GENERator demonstrates state-of-the-art performance across +both established and newly proposed benchmarks. The model adheres to the +central dogma of molecular biology, accurately generating protein-coding +sequences that translate into proteins structurally analogous to known +families. It also shows significant promise in sequence optimization, +particularly through the prompt-responsive generation of enhancer sequences +with specific activity profiles. These capabilities position the GENERator as a +pivotal tool for genomic research and biotechnological advancement, enhancing +our ability to interpret and predict complex biological systems and enabling +precise genomic interventions. Implementation details and supplementary +resources are available at https://github.com/GenerTeam/GENERator. + +
+
+
+
+
+
+
+
+ + Machine Learning 151 + +
+
+
+ + ☆ L$^2$M: Mutual Information Scaling Law for Long-Context Language + Modeling + + +
+ We rigorously establish a bipartite mutual information scaling law in natural +language that governs long-range dependencies. This scaling law, which we show +is distinct from and scales independently of the conventional two-point mutual +information, is the key to understanding long-context language modeling. Using +this scaling law, we formulate the Long-context Language Modeling (L$^2$M) +condition, which relates a model's capacity for effective long context length +modeling to the scaling of its latent state size for storing past information. +Our results are validated through experiments on both transformers and state +space models. This work establishes a theoretical foundation that guides the +development of large language models toward longer context lengths. + +
+
+ comment: 29 pages, 12 figures, 1 table +
+
+
+
+
+ + ☆ Enough Coin Flips Can Make LLMs Act Bayesian + + +
+ Large language models (LLMs) exhibit the ability to generalize given few-shot +examples in their input prompt, an emergent capability known as in-context +learning (ICL). We investigate whether LLMs utilize ICL to perform structured +reasoning in ways that are consistent with a Bayesian framework or rely on +pattern matching. Using a controlled setting of biased coin flips, we find +that: (1) LLMs often possess biased priors, causing initial divergence in +zero-shot settings, (2) in-context evidence outweighs explicit bias +instructions, (3) LLMs broadly follow Bayesian posterior updates, with +deviations primarily due to miscalibrated priors rather than flawed updates, +and (4) attention magnitude has negligible effect on Bayesian inference. With +sufficient demonstrations of biased coin flips via ICL, LLMs update their +priors in a Bayesian manner. + +
+
+
+
+
+ + ☆ Floxels: Fast Unsupervised Voxel Based Scene Flow Estimation CVPR 2025 + + +
+ Scene flow estimation is a foundational task for many robotic applications, +including robust dynamic object detection, automatic labeling, and sensor +synchronization. Two types of approaches to the problem have evolved: 1) +Supervised and 2) optimization-based methods. Supervised methods are fast +during inference and achieve high-quality results, however, they are limited by +the need for large amounts of labeled training data and are susceptible to +domain gaps. In contrast, unsupervised test-time optimization methods do not +face the problem of domain gaps but usually suffer from substantial runtime, +exhibit artifacts, or fail to converge to the right solution. In this work, we +mitigate several limitations of existing optimization-based methods. To this +end, we 1) introduce a simple voxel grid-based model that improves over the +standard MLP-based formulation in multiple dimensions and 2) introduce a new +multiframe loss formulation. 3) We combine both contributions in our new +method, termed Floxels. On the Argoverse 2 benchmark, Floxels is surpassed only +by EulerFlow among unsupervised methods while achieving comparable performance +at a fraction of the computational cost. Floxels achieves a massive speedup of +more than ~60 - 140x over EulerFlow, reducing the runtime from a day to 10 +minutes per sequence. Over the faster but low-quality baseline, NSFP, Floxels +achieves a speedup of ~14x. + +
+
+ comment: Accepted at CVPR 2025 +
+
+
+
+
+ + ☆ Predictable Scale: Part I -- Optimal Hyperparameter Scaling Law in Large + Language Model Pretraining + + +
+ The impressive capabilities of Large Language Models (LLMs) across diverse +tasks are now well-established, yet their effective deployment necessitates +careful hyperparameter optimization. Through extensive empirical studies +involving grid searches across diverse configurations, we discover universal +scaling laws governing these hyperparameters: optimal learning rate follows a +power-law relationship with both model parameters and data sizes, while optimal +batch size scales primarily with data sizes. Our analysis reveals a convex +optimization landscape for hyperparameters under fixed models and data size +conditions. This convexity implies an optimal hyperparameter plateau. We +contribute a universal, plug-and-play optimal hyperparameter tool for the +community. Its estimated values on the test set are merely 0.07\% away from the +globally optimal LLM performance found via an exhaustive search. These laws +demonstrate remarkable robustness across variations in model sparsity, training +data distribution, and model shape. To the best of our knowledge, this is the first work +that unifies different model shapes and structures, such as Mixture-of-Experts +models and dense transformers, as well as establishes optimal hyperparameter +scaling laws across diverse data distributions. This exhaustive optimization +process demands substantial computational resources, utilizing nearly one +million NVIDIA H800 GPU hours to train 3,700 LLMs of varying sizes and +hyperparameters from scratch and consuming approximately 100 trillion tokens in +total. To facilitate reproducibility and further research, we will +progressively release all loss measurements and model checkpoints through our +designated repository https://step-law.github.io/ + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Scaling Rich Style-Prompted Text-to-Speech Datasets + + +
+ We introduce Paralinguistic Speech Captions (ParaSpeechCaps), a large-scale +dataset that annotates speech utterances with rich style captions. While rich +abstract tags (e.g. guttural, nasal, pained) have been explored in small-scale +human-annotated datasets, existing large-scale datasets only cover basic tags +(e.g. low-pitched, slow, loud). We combine off-the-shelf text and speech +embedders, classifiers and an audio language model to automatically scale rich +tag annotations for the first time. ParaSpeechCaps covers a total of 59 style +tags, including both speaker-level intrinsic tags and utterance-level +situational tags. It consists of 342 hours of human-labelled data (PSC-Base) +and 2427 hours of automatically annotated data (PSC-Scaled). We finetune +Parler-TTS, an open-source style-prompted TTS model, on ParaSpeechCaps, and +achieve improved style consistency (+7.9% Consistency MOS) and speech quality +(+15.5% Naturalness MOS) over the best performing baseline that combines +existing rich style tag datasets. We ablate several of our dataset design +choices to lay the foundation for future work in this space. Our dataset, +models and code are released at https://github.com/ajd12342/paraspeechcaps . + +
+
+
+
+
+ + ☆ Efficiently Escaping Saddle Points under Generalized Smoothness via + Self-Bounding Regularity + + +
+ In this paper, we study the problem of non-convex optimization on functions +that are not necessarily smooth using first order methods. Smoothness +(functions whose gradient and/or Hessian are Lipschitz) is not satisfied by +many machine learning problems in both theory and practice, motivating a recent +line of work studying the convergence of first order methods to first order +stationary points under appropriate generalizations of smoothness. + We develop a novel framework to study convergence of first order methods to +first and \textit{second} order stationary points under generalized smoothness, +under more general smoothness assumptions than the literature. Using our +framework, we show appropriate variants of GD and SGD (e.g. with appropriate +perturbations) can converge not just to first order but also \textit{second +order stationary points} in runtime polylogarithmic in the dimension. To our +knowledge, our work contains the first such result, as well as the first +'non-textbook' rate for non-convex optimization under generalized smoothness. +We demonstrate that several canonical non-convex optimization problems fall +under our setting and framework. + +
+
+ comment: 79 pages +
+
+
+
+
+ + ☆ Sample-Optimal Agnostic Boosting with Unlabeled Data + + +
+ Boosting provides a practical and provably effective framework for +constructing accurate learning algorithms from inaccurate rules of thumb. It +extends the promise of sample-efficient learning to settings where direct +Empirical Risk Minimization (ERM) may not be implementable efficiently. In the +realizable setting, boosting is known to offer this computational reprieve +without compromising on sample efficiency. However, in the agnostic case, +existing boosting algorithms fall short of achieving the optimal sample +complexity. + This paper highlights an unexpected and previously unexplored avenue of +improvement: unlabeled samples. We design a computationally efficient agnostic +boosting algorithm that matches the sample complexity of ERM, given +polynomially many additional unlabeled samples. In fact, we show that the total +number of samples needed, unlabeled and labeled inclusive, is never more than +that for the best known agnostic boosting algorithm -- so this result is never +worse -- while only a vanishing fraction of these need to be labeled for the +algorithm to succeed. This is particularly fortuitous for learning-theoretic +applications of agnostic boosting, which often take place in the +distribution-specific setting, where unlabeled samples can be availed for free. +We detail other applications of this result in reinforcement learning. + +
+
+
+
+
+ + ☆ Universality of Layer-Level Entropy-Weighted Quantization Beyond Model + Architecture and Size + + +
+ We present a novel approach to selective model quantization that transcends +the limitations of architecture-specific and size-dependent compression methods +for Large Language Models (LLMs) using Entropy-Weighted Quantization (EWQ). By +analyzing the entropy distribution across transformer blocks, EWQ determines +which blocks can be safely quantized without causing significant performance +degradation, independent of model architecture or size. Our method outperforms +uniform quantization approaches, maintaining Massive Multitask Language +Understanding (MMLU) accuracy scores within 0.5% of unquantized models while +reducing memory usage by up to 18%. We demonstrate the effectiveness of EWQ +across multiple architectures-from 1.6B to 70B parameters-showcasing consistent +improvements in the quality-compression trade-off regardless of model scale or +architectural design. A surprising finding of EWQ is its ability to reduce +perplexity compared to unquantized models, suggesting the presence of +beneficial regularization through selective precision reduction. This +improvement holds across different model families, indicating a fundamental +relationship between layer-level entropy and optimal precision requirements. +Additionally, we introduce FastEWQ, a rapid method for entropy distribution +analysis that eliminates the need for loading model weights. This technique +leverages universal characteristics of entropy distribution that persist across +various architectures and scales, enabling near-instantaneous quantization +decisions while maintaining 80% classification accuracy with full entropy +analysis. Our results demonstrate that effective quantization strategies can be +developed independently of specific architectural choices or model sizes, +opening new possibilities for efficient LLM deployment. + +
+
+ comment: 29 pages, 7 figures, 14 tables; Comments are welcome +
+
+
+
+
+ + ☆ L1: Controlling How Long A Reasoning Model Thinks With Reinforcement + Learning + + +
+ Reasoning language models have shown an uncanny ability to improve +performance at test-time by ``thinking longer''-that is, by generating longer +chain-of-thought sequences and hence using more compute. However, the length of +their chain-of-thought reasoning is not controllable, making it impossible to +allocate test-time compute to achieve a desired level of performance. We +introduce Length Controlled Policy Optimization (LCPO), a simple reinforcement +learning method that optimizes for accuracy and adherence to user-specified +length constraints. We use LCPO to train L1, a reasoning language model that +produces outputs satisfying a length constraint given in its prompt. L1's +length control allows for smoothly trading off computational cost and accuracy +on a wide range of tasks, and outperforms the state-of-the-art S1 method for +length control. Furthermore, we uncover an unexpected short chain-of-thought +capability in models trained with LCPO. For instance, our 1.5B L1 model +surpasses GPT-4o at equal reasoning lengths. Overall, LCPO enables precise +control over reasoning length, allowing for fine-grained allocation of +test-time compute and accuracy. We release code and models at +https://www.cmu-l3.github.io/l1 + +
+
+
+
+
+ + ☆ Coarse graining and reduced order models for plume ejection dynamics + + +
+ Monitoring the atmospheric dispersion of pollutants is increasingly critical +for environmental impact assessments. High-fidelity computational models are +often employed to simulate plume dynamics, guiding decision-making and +prioritizing resource deployment. However, such models can be prohibitively +expensive to simulate, as they require resolving turbulent flows at fine +spatial and temporal resolutions. Moreover, there are at least two distinct +dynamical regimes of interest in the plume: (i) the initial ejection of the +plume where turbulent mixing is generated by the shear-driven Kelvin-Helmholtz +instability, and (ii) the ensuing turbulent diffusion and advection which is +often modeled by the Gaussian plume model. We address the challenge of modeling +the initial plume generation. Specifically, we propose a data-driven framework +that identifies a reduced-order analytical model for plume dynamics -- directly +from video data. We extract a time series of plume center and edge points from +video snapshots and evaluate different regressions based on their extrapolation +performance to generate a time series of coefficients that characterize the +plume's overall direction and spread. We regress to a sinusoidal model inspired +by the Kelvin-Helmholtz instability for the edge points in order to identify +the plume's dispersion and vorticity. Overall, this reduced-order modeling +framework provides a data-driven and lightweight approach to capture the +dominant features of the initial nonlinear point-source plume dynamics, +agnostic to plume type and starting only from video. The resulting model is a +precursor to standard models such as the Gaussian plume model and has the +potential to enable rapid assessment and evaluation of critical environmental +hazards, such as methane leaks, chemical spills, and pollutant dispersal from +smokestacks. + +
+
+
+
+
+ + ☆ Compositional World Knowledge leads to High Utility Synthetic data + + +
+ Machine learning systems struggle with robustness under subpopulation +shifts. This problem becomes especially pronounced in scenarios where only a +subset of attribute combinations is observed during training -a severe form of +subpopulation shift, referred to as compositional shift. To address this problem, +we ask the following question: Can we improve the robustness by training on +synthetic data, spanning all possible attribute combinations? We first show +that training of conditional diffusion models on limited data leads to an incorrect +underlying distribution. Therefore, synthetic data sampled from such models +will result in unfaithful samples and does not lead to improved performance of +downstream machine learning systems. To address this problem, we propose CoInD +to reflect the compositional nature of the world by enforcing conditional +independence through minimizing Fisher's divergence between joint and marginal +distributions. We demonstrate that synthetic data generated by CoInD is +faithful and this translates to state-of-the-art worst-group accuracy on +compositional shift tasks on CelebA. + +
+
+
+
+
+ + ☆ Propagating Model Uncertainty through Filtering-based Probabilistic + Numerical ODE Solvers + + +
+ Filtering-based probabilistic numerical solvers for ordinary differential +equations (ODEs), also known as ODE filters, have been established as efficient +methods for quantifying numerical uncertainty in the solution of ODEs. In +practical applications, however, the underlying dynamical system often contains +uncertain parameters, requiring the propagation of this model uncertainty to +the ODE solution. In this paper, we demonstrate that ODE filters, despite their +probabilistic nature, do not automatically solve this uncertainty propagation +problem. To address this limitation, we present a novel approach that combines +ODE filters with numerical quadrature to properly marginalize over uncertain +parameters, while accounting for both parameter uncertainty and numerical +solver uncertainty. Experiments across multiple dynamical systems demonstrate +that the resulting uncertainty estimates closely match reference solutions. +Notably, we show how the numerical uncertainty from the ODE solver can help +prevent overconfidence in the propagated uncertainty estimates, especially when +using larger step sizes. Our results illustrate that probabilistic numerical +methods can effectively quantify both numerical and parametric uncertainty in +dynamical systems. + +
+
+
+
+
+ + ☆ Matrix Factorization for Inferring Associations and Missing Links + + +
+ Missing link prediction is a method for network analysis, with applications +in recommender systems, biology, social sciences, cybersecurity, information +retrieval, and Artificial Intelligence (AI) reasoning in Knowledge Graphs. +Missing link prediction identifies unseen but potentially existing connections +in a network by analyzing the observed patterns and relationships. In +proliferation detection, this supports efforts to identify and characterize +attempts by state and non-state actors to acquire nuclear weapons or associated +technology - a notoriously challenging but vital mission for global security. +Dimensionality reduction techniques like Non-Negative Matrix Factorization +(NMF) and Logistic Matrix Factorization (LMF) are effective but require +selection of the matrix rank parameter, that is, of the number of hidden +features, k, to avoid over/under-fitting. We introduce novel Weighted (WNMFk), +Boolean (BNMFk), and Recommender (RNMFk) matrix factorization methods, along +with ensemble variants incorporating logistic factorization, for link +prediction. Our methods integrate automatic model determination for rank +estimation by evaluating stability and accuracy using a modified bootstrap +methodology and uncertainty quantification (UQ), assessing prediction +reliability under random perturbations. We incorporate Otsu threshold selection +and k-means clustering for Boolean matrix factorization, comparing them to +coordinate descent-based Boolean thresholding. Our experiments highlight the +impact of rank k selection, evaluate model performance under varying test-set +sizes, and demonstrate the benefits of UQ for reliable predictions using +abstention. We validate our methods on three synthetic datasets (Boolean and +uniformly distributed) and benchmark them against LMF and symmetric LMF +(symLMF) on five real-world protein-protein interaction networks, showcasing an +improved prediction performance. + +
+
+ comment: 35 pages, 14 figures, 3 tables, 1 algorithm +
+
+
+
+
+ + ☆ Multi-Agent Inverse Q-Learning from Demonstrations ICRA + + +
+ When reward functions are hand-designed, deep reinforcement learning +algorithms often suffer from reward misspecification, causing them to learn +suboptimal policies in terms of the intended task objectives. In the +single-agent case, inverse reinforcement learning (IRL) techniques attempt to +address this issue by inferring the reward function from expert demonstrations. +However, in multi-agent problems, misalignment between the learned and true +objectives is exacerbated due to increased environment non-stationarity and +variance that scales with multiple agents. As such, in multi-agent general-sum +games, multi-agent IRL algorithms have difficulty balancing cooperative and +competitive objectives. To address these issues, we propose Multi-Agent +Marginal Q-Learning from Demonstrations (MAMQL), a novel sample-efficient +framework for multi-agent IRL. For each agent, MAMQL learns a critic +marginalized over the other agents' policies, allowing for a well-motivated use +of Boltzmann policies in the multi-agent context. We identify a connection +between optimal marginalized critics and single-agent soft-Q IRL, allowing us +to apply a direct, simple optimization criterion from the single-agent domain. +Across our experiments on three different simulated domains, MAMQL +significantly outperforms previous multi-agent methods in average reward, +sample efficiency, and reward recovery by often more than 2-5x. We make our +code available at https://sites.google.com/view/mamql . + +
+
+ comment: 8 pages, 4 figures, 2 tables. Published at the International + Conference on Robotics and Automation (ICRA) 2025 +
+
+
+
+
+ + ☆ An Information-theoretic Multi-task Representation Learning Framework + for Natural Language Understanding AAAI 2025 + + +
+ This paper proposes a new principled multi-task representation learning +framework (InfoMTL) to extract noise-invariant sufficient representations for +all tasks. It ensures sufficiency of shared representations for all tasks and +mitigates the negative effect of redundant features, which can enhance language +understanding of pre-trained language models (PLMs) under the multi-task +paradigm. Firstly, a shared information maximization principle is proposed to +learn more sufficient shared representations for all target tasks. It can avoid +the insufficiency issue arising from representation compression in the +multi-task paradigm. Secondly, a task-specific information minimization +principle is designed to mitigate the negative effect of potential redundant +features in the input for each task. It can compress task-irrelevant redundant +information and preserve necessary information relevant to the target for +multi-task prediction. Experiments on six classification benchmarks show that +our method outperforms 12 comparative multi-task methods under the same +multi-task settings, especially in data-constrained and noisy scenarios. +Extensive experiments demonstrate that the learned representations are more +sufficient, data-efficient, and robust. + +
+
+ comment: 11 pages, accepted to AAAI 2025 (main conference), the code is + available at https://github.com/zerohd4869/InfoMTL +
+
+
+
+
+ + ☆ CLDyB: Towards Dynamic Benchmarking for Continual Learning with + Pre-trained Models + + +
+ The advent of the foundation model era has sparked significant research +interest in leveraging pre-trained representations for continual learning (CL), +yielding a series of top-performing CL methods on standard evaluation +benchmarks. Nonetheless, there are growing concerns regarding potential data +contamination during the pre-training stage. Furthermore, standard evaluation +benchmarks, which are typically static, fail to capture the complexities of +real-world CL scenarios, resulting in saturated performance. To address these +issues, we describe CL on dynamic benchmarks (CLDyB), a general computational +framework based on Markov decision processes for evaluating CL methods +reliably. CLDyB dynamically identifies inherently difficult and +algorithm-dependent tasks for the given CL methods, and determines challenging +task orders using Monte Carlo tree search. Leveraging CLDyB, we first conduct a +joint evaluation of multiple state-of-the-art CL methods, leading to a set of +commonly challenging and generalizable task sequences where existing CL methods +tend to perform poorly. We then conduct separate evaluations of individual CL +methods using CLDyB, discovering their respective strengths and weaknesses. The +source code and generated task sequences are publicly accessible at +https://github.com/szc12153/CLDyB. + +
+
+
+
+
+ + ☆ Joint Masked Reconstruction and Contrastive Learning for Mining + Interactions Between Proteins + + +
+ Protein-protein interaction (PPI) prediction is an instrumental means in +elucidating the mechanisms underlying cellular operations, holding significant +practical implications for the realms of pharmaceutical development and +clinical treatment. Presently, the majority of research methods primarily +concentrate on the analysis of amino acid sequences, while investigations +predicated on protein structures remain in the nascent stages of exploration. +Despite the emergence of several structure-based algorithms in recent years, +these are still confronted with inherent challenges: (1) the extraction of +intrinsic structural information of proteins typically necessitates the +expenditure of substantial computational resources; (2) these models are overly +reliant on seen protein data, struggling to effectively unearth interaction +cues between unknown proteins. To further propel advancements in this domain, +this paper introduces a novel PPI prediction method joining masked +reconstruction and contrastive learning, termed JmcPPI. This methodology +dissects the PPI prediction task into two distinct phases: during the residue +structure encoding phase, JmcPPI devises two feature reconstruction tasks and +employs graph attention mechanism to capture structural information between +residues; during the protein interaction inference phase, JmcPPI perturbs the +original PPI graph and employs a multi-graph contrastive learning strategy to +thoroughly mine extrinsic interaction information of novel proteins. Extensive +experiments conducted on three widely utilized PPI datasets demonstrate that +JmcPPI surpasses existing optimal baseline models across various data partition +schemes. The associated code can be accessed via +https://github.com/lijfrank-open/JmcPPI. + +
+
+ comment: Submitted +
+
+
+
+
+ + ☆ Transferable Foundation Models for Geometric Tasks on Point Cloud + Representations: Geometric Neural Operators + + +
+ We introduce methods for obtaining pretrained Geometric Neural Operators +(GNPs) that can serve as basal foundation models for use in obtaining geometric +features. These can be used within data processing pipelines for machine +learning tasks and numerical methods. We show how our GNPs can be trained to +learn robust latent representations for the differential geometry of +point-clouds to provide estimates of metric, curvature, and other shape-related +features. We demonstrate how our pre-trained GNPs can be used (i) to estimate +the geometric properties of surfaces of arbitrary shape and topologies with +robustness in the presence of noise, (ii) to approximate solutions of geometric +partial differential equations (PDEs) on manifolds, and (iii) to solve +equations for shape deformations such as curvature driven flows. We also +release a package of the codes and weights for using our pre-trained GNPs for +processing point cloud representations. This allows for incorporating our +pre-trained GNPs as components for reuse within existing and new data +processing pipelines. The GNPs also can be used as part of numerical solvers +involving geometry or as part of methods for performing inference and other +geometric tasks. + +
+
+
+
+
+ + Simulating the Real World: A Unified Survey of Multimodal Generative + Models + + +
+ Understanding and replicating the real world is a critical challenge in +Artificial General Intelligence (AGI) research. To achieve this, many existing +approaches, such as world models, aim to capture the fundamental principles +governing the physical world, enabling more accurate simulations and meaningful +interactions. However, current methods often treat different modalities, +including 2D (images), videos, 3D, and 4D representations, as independent +domains, overlooking their interdependencies. Additionally, these methods +typically focus on isolated dimensions of reality without systematically +integrating their connections. In this survey, we present a unified survey for +multimodal generative models that investigate the progression of data +dimensionality in real-world simulation. Specifically, this survey starts from +2D generation (appearance), then moves to video (appearance+dynamics) and 3D +generation (appearance+geometry), and finally culminates in 4D generation that +integrates all dimensions. To the best of our knowledge, this is the first +attempt to systematically unify the study of 2D, video, 3D and 4D generation +within a single framework. To guide future research, we provide a comprehensive +review of datasets, evaluation metrics and future directions, and foster +insights for newcomers. This survey serves as a bridge to advance the study of +multimodal generative models and real-world simulation within a unified +framework. + +
+
+ comment: Repository for the related papers at + https://github.com/ALEEEHU/World-Simulator +
+
+
+
+
+ + ☆ Enhancing SAM with Efficient Prompting and Preference Optimization for + Semi-supervised Medical Image Segmentation CVPR 2025 + + +
+ Foundational models such as the Segment Anything Model (SAM) are gaining +traction in medical imaging segmentation, supporting multiple downstream tasks. +However, such models are supervised in nature, still relying on large annotated +datasets or prompts supplied by experts. Conventional techniques such as active +learning to alleviate such limitations are limited in scope and still +necessitate continuous human involvement and complex domain knowledge for label +refinement or establishing reward ground truth. To address these challenges, we +propose an enhanced Segment Anything Model (SAM) framework that utilizes +annotation-efficient prompts generated in a fully unsupervised fashion, while +still capturing essential semantic, location, and shape information through +contrastive language-image pretraining and visual question answering. We adopt +the direct preference optimization technique to design an optimal policy that +enables the model to generate high-fidelity segmentations with simple ratings +or rankings provided by a virtual annotator simulating the human annotation +process. State-of-the-art performance of our framework in tasks such as lung +segmentation, breast tumor segmentation, and organ segmentation across various +modalities, including X-ray, ultrasound, and abdominal CT, justifies its +effectiveness in low-annotation data scenarios. + +
+
+ comment: Accepted to CVPR 2025 +
+
+
+
+
+ + ☆ No Forgetting Learning: Memory-free Continual Learning ICCV 2025 + + +
+ Continual Learning (CL) remains a central challenge in deep learning, where +models must sequentially acquire new knowledge while mitigating Catastrophic +Forgetting (CF) of prior tasks. Existing approaches often struggle with +efficiency and scalability, requiring extensive memory or model buffers. This +work introduces ``No Forgetting Learning" (NFL), a memory-free CL framework +that leverages knowledge distillation to maintain stability while preserving +plasticity. Memory-free means the NFL does not rely on any memory buffer. +Through extensive evaluations of three benchmark datasets, we demonstrate that +NFL achieves competitive performance while utilizing approximately 14.75 times +less memory than state-of-the-art methods. Furthermore, we introduce a new +metric to better assess CL's plasticity-stability trade-off. + +
+
+ comment: This paper is submitted to ICCV 2025 +
+
+
+
+
+ + ☆ Mark Your LLM: Detecting the Misuse of Open-Source Large Language Models + via Watermarking ICLR 2025 + + +
+ As open-source large language models (LLMs) like Llama3 become more capable, +it is crucial to develop watermarking techniques to detect their potential +misuse. Existing watermarking methods either add watermarks during LLM +inference, which is unsuitable for open-source LLMs, or primarily target +classification LLMs rather than recent generative LLMs. Adapting these +watermarks to open-source LLMs for misuse detection remains an open challenge. +This work defines two misuse scenarios for open-source LLMs: intellectual +property (IP) violation and LLM Usage Violation. Then, we explore the +application of inference-time watermark distillation and backdoor watermarking +in these contexts. We propose comprehensive evaluation methods to assess the +impact of various real-world further fine-tuning scenarios on watermarks and +the effect of these watermarks on LLM performance. Our experiments reveal that +backdoor watermarking could effectively detect IP Violation, while +inference-time watermark distillation is applicable in both scenarios but less +robust to further fine-tuning and has a more significant impact on LLM +performance compared to backdoor watermarking. Exploring more advanced +watermarking methods for open-source LLMs to detect their misuse should be an +important future direction. + +
+
+ comment: Accepted by the 1st Workshop on GenAI Watermarking, collocated with + ICLR 2025 +
+
+
+
+
+ + ☆ IDInit: A Universal and Stable Initialization Method for Neural Network + Training ICLR 2025 + + +
+ Deep neural networks have achieved remarkable accomplishments in practice. +The success of these networks hinges on effective initialization methods, which +are vital for ensuring stable and rapid convergence during training. Recently, +initialization methods that maintain identity transition within layers have +shown good efficiency in network training. These techniques (e.g., Fixup) set +specific weights to zero to achieve identity control. However, settings of +remaining weight (e.g., Fixup uses random values to initialize non-zero +weights) will affect the inductive bias that is achieved only by a zero weight, +which may be harmful to training. Addressing this concern, we introduce fully +identical initialization (IDInit), a novel method that preserves identity in +both the main and sub-stem layers of residual networks. IDInit employs a padded +identity-like matrix to overcome rank constraints in non-square weight +matrices. Furthermore, we show the convergence problem of an identity matrix +can be solved by stochastic gradient descent. Additionally, we enhance the +universality of IDInit by processing higher-order weights and addressing dead +neuron problems. IDInit is a straightforward yet effective initialization +method, with improved convergence, stability, and performance across various +settings, including large-scale datasets and deep models. + +
+
+ comment: Accepted in ICLR 2025 +
+
+
+
+
+ + ☆ The Best of Both Worlds: Integrating Language Models and Diffusion + Models for Video Generation + + +
+ Recent advancements in text-to-video (T2V) generation have been driven by two +competing paradigms: autoregressive language models and diffusion models. +However, each paradigm has intrinsic limitations: language models struggle with +visual quality and error accumulation, while diffusion models lack semantic +understanding and causal modeling. In this work, we propose LanDiff, a hybrid +framework that synergizes the strengths of both paradigms through +coarse-to-fine generation. Our architecture introduces three key innovations: +(1) a semantic tokenizer that compresses 3D visual features into compact 1D +discrete representations through efficient semantic compression, achieving a +$\sim$14,000$\times$ compression ratio; (2) a language model that generates +semantic tokens with high-level semantic relationships; (3) a streaming +diffusion model that refines coarse semantics into high-fidelity videos. +Experiments show that LanDiff, a 5B model, achieves a score of 85.43 on the +VBench T2V benchmark, surpassing the state-of-the-art open-source models +Hunyuan Video (13B) and other commercial models such as Sora, Keling, and +Hailuo. Furthermore, our model also achieves state-of-the-art performance in +long video generation, surpassing other open-source models in this field. Our +demo can be viewed at https://landiff.github.io/. + +
+
+
+
+
+ + ☆ Fusion of Various Optimization Based Feature Smoothing Methods for + Wearable and Non-invasive Blood Glucose Estimation + + +
+ Recently, the wearable and non-invasive blood glucose estimation approach has +been proposed. However, due to the unreliability of the acquisition device, the +presence of the noise and the variations of the acquisition environments, the +obtained features and the reference blood glucose values are highly unreliable. +To address this issue, this paper proposes a polynomial fitting approach to +smooth the obtained features or the reference blood glucose values. First, the +blood glucose values are estimated based on the individual optimization +approaches. Second, the absolute difference values between the estimated blood +glucose values and the actual blood glucose values based on each optimization +approach are computed. Third, these absolute difference values for each +optimization approach are sorted in the ascending order. Fourth, for each +sorted blood glucose value, the optimization method corresponding to the +minimum absolute difference value is selected. Fifth, the accumulate +probability of each selected optimization method is computed. If the accumulate +probability of any selected optimization method at a point is greater than a +threshold value, then the accumulate probabilities of these three selected +optimization methods at that point are reset to zero. A range of the sorted +blood glucose values are defined as that with the corresponding boundaries +points being the previous reset point and this reset point. Hence, after +performing the above procedures for all the sorted reference blood glucose +values in the validation set, the regions of the sorted reference blood glucose +values and the corresponding optimization methods in these regions are +determined. The computer numerical simulation results show that our proposed +method yields the mean absolute relative deviation (MARD) at 0.0930 and the +percentage of the test data falling in the zone A of the Clarke error grid at +94.1176%. + +
+
+ comment: This version corrects several typos +
+
+
+
+
+ + ☆ HybridNorm: Towards Stable and Efficient Transformer Training via Hybrid + Normalization + + +
+ Transformers have become the de facto architecture for a wide range of +machine learning tasks, particularly in large language models (LLMs). Despite +their remarkable performance, challenges remain in training deep transformer +networks, especially regarding the location of layer normalization. While +Pre-Norm structures facilitate easier training due to their more prominent +identity path, they often yield suboptimal performance compared to Post-Norm. +In this paper, we propose $\textbf{HybridNorm}$, a straightforward yet +effective hybrid normalization strategy that integrates the advantages of both +Pre-Norm and Post-Norm approaches. Specifically, HybridNorm employs QKV +normalization within the attention mechanism and Post-Norm in the feed-forward +network (FFN) of each transformer block. This design not only stabilizes +training but also enhances performance, particularly in the context of LLMs. +Comprehensive experiments in both dense and sparse architectures show that +HybridNorm consistently outperforms both Pre-Norm and Post-Norm approaches, +achieving state-of-the-art results across various benchmarks. These findings +highlight the potential of HybridNorm as a more stable and effective technique +for improving the training and performance of deep transformer models. %Code +will be made publicly available. Code is available at +https://github.com/BryceZhuo/HybridNorm. + +
+
+
+
+
+ + ☆ Advancing Solutions for the Three-Body Problem Through Physics-Informed + Neural Networks + + +
+ First formulated by Sir Isaac Newton in his work "Philosophiae Naturalis +Principia Mathematica", the concept of the Three-Body Problem was put forth as +a study of the motion of the three celestial bodies within the Earth-Sun-Moon +system. In a generalized definition, it seeks to predict the motion for an +isolated system composed of three point masses freely interacting under +Newton's law of universal attraction. This proves to be analogous to a +multitude of interactions between celestial bodies, and thus, the problem finds +applicability within the studies of celestial mechanics. Despite numerous +attempts by renowned physicists to solve it throughout the last three +centuries, no general closed-form solutions have been reached due to its +inherently chaotic nature for most initial conditions. Current state-of-the-art +solutions are based on two approaches, either numerical high-precision +integration or machine learning-based. Notwithstanding the breakthroughs of +neural networks, these present a significant limitation, which is their +ignorance of any prior knowledge of the chaotic systems presented. Thus, in +this work, we propose a novel method that utilizes Physics-Informed Neural +Networks (PINNs). These deep neural networks are able to incorporate any prior +system knowledge expressible as an Ordinary Differential Equation (ODE) into +their learning processes as a regularizing agent. Our findings showcase that +PINNs surpass current state-of-the-art machine learning methods with comparable +prediction quality. Despite a better prediction quality, the usability of +numerical integrators suffers due to their prohibitively high computational +cost. These findings confirm that PINNs are both effective and time-efficient +open-form solvers of the Three-Body Problem that capitalize on the extensive +knowledge we hold of classical mechanics. + +
+
+ comment: 14 pages, 25 figures, 3 tables. 75th International Astronautical + Congress (IAC), Milan, Italy, 14-18 October +
+
+
+
+
+ + ☆ PSDNorm: Test-Time Temporal Normalization for Deep Learning on EEG + Signals + + +
+ Distribution shift poses a significant challenge in machine learning, +particularly in biomedical applications such as EEG signals collected across +different subjects, institutions, and recording devices. While existing +normalization layers, Batch-Norm, LayerNorm and InstanceNorm, help address +distribution shifts, they fail to capture the temporal dependencies inherent in +temporal signals. In this paper, we propose PSDNorm, a layer that leverages +Monge mapping and temporal context to normalize feature maps in deep learning +models. Notably, the proposed method operates as a test-time domain adaptation +technique, addressing distribution shifts without additional training. +Evaluations on 10 sleep staging datasets using the U-Time model demonstrate +that PSDNorm achieves state-of-the-art performance at test time on datasets not +seen during training while being 4x more data-efficient than the best baseline. +Additionally, PSDNorm provides a significant improvement in robustness, +achieving markedly higher F1 scores for the 20% hardest subjects. + +
+
+
+
+
+ + ☆ Data-augmented Learning of Geodesic Distances in Irregular Domains + through Soner Boundary Conditions + + +
+ Geodesic distances play a fundamental role in robotics, as they efficiently +encode global geometric information of the domain. Recent methods use neural +networks to approximate geodesic distances by solving the Eikonal equation +through physics-informed approaches. While effective, these approaches often +suffer from unstable convergence during training in complex environments. We +propose a framework to learn geodesic distances in irregular domains by using +the Soner boundary condition, and systematically evaluate the impact of data +losses on training stability and solution accuracy. Our experiments demonstrate +that incorporating data losses significantly improves convergence robustness, +reducing training instabilities and sensitivity to initialization. These +findings suggest that hybrid data-physics approaches can effectively enhance +the reliability of learning-based geodesic distance solvers with sparse data. + +
+
+
+
+
+ + ☆ Meta Learning not to Learn: Robustly Informing Meta-Learning under + Nuisance-Varying Families + + +
+ In settings where both spurious and causal predictors are available, standard +neural networks trained under the objective of empirical risk minimization +(ERM) with no additional inductive biases tend to have a dependence on a +spurious feature. As a result, it is necessary to integrate additional +inductive biases in order to guide the network toward generalizable hypotheses. +Often these spurious features are shared across related tasks, such as +estimating disease prognoses from image scans coming from different hospitals, +making the challenge of generalization more difficult. In these settings, it is +important that methods are able to integrate the proper inductive biases to +generalize across both nuisance-varying families as well as task families. +Motivated by this setting, we present RIME (Robustly Informed Meta lEarning), a +new method for meta learning under the presence of both positive and negative +inductive biases (what to learn and what not to learn). We first develop a +theoretical causal framework showing why existing approaches at knowledge +integration can lead to worse performance on distributionally robust +objectives. We then show that RIME is able to simultaneously integrate both +biases, reaching state of the art performance under distributionally robust +objectives in informed meta-learning settings under nuisance-varying families. + +
+
+
+
+
+ + ☆ Compositional Causal Reasoning Evaluation in Language Models + + +
+ Causal reasoning and compositional reasoning are two core aspirations in +generative AI. Measuring the extent of these behaviors requires principled +evaluation methods. We explore a unified perspective that considers both +behaviors simultaneously, termed compositional causal reasoning (CCR): the +ability to infer how causal measures compose and, equivalently, how causal +quantities propagate through graphs. We instantiate a framework for the +systematic evaluation of CCR for the average treatment effect and the +probability of necessity and sufficiency. As proof of concept, we demonstrate +the design of CCR tasks for language models in the LLama, Phi, and GPT +families. On a math word problem, our framework revealed a range of +taxonomically distinct error patterns. Additionally, CCR errors increased with +the complexity of causal paths for all models except o1. + +
+
+
+
+
+ + ☆ Federated Dynamic Modeling and Learning for Spatiotemporal Data + Forecasting + + +
+ This paper presents an advanced Federated Learning (FL) framework for +forecasting complex spatiotemporal data, improving upon recent state-of-the-art +models. In the proposed approach, the original Gated Recurrent Unit (GRU) +module within previous Dynamic Spatial--Temporal Graph Convolutional Recurrent +Network (DSTGCRN) modeling is first replaced with a Long Short-Term Memory +(LSTM) network, enabling the resulting model to more effectively capture +long-term dependencies inherent to time series data. The resulting architecture +significantly improves the model's capacity to handle complex temporal patterns +in diverse forecasting applications. Furthermore, the proposed FL framework +integrates a novel Client-Side Validation (CSV) mechanism, introducing a +critical validation step at the client level before incorporating aggregated +parameters from the central server into local models. This ensures that only +the most effective updates are adopted, improving both the robustness and +accuracy of the forecasting model across clients. The efficiency of our +approach is demonstrated through extensive experiments on real-world +applications, including public datasets for multimodal transport demand +forecasting and private datasets for Origin-Destination (OD) matrix forecasting +in urban areas. The results demonstrate substantial improvements over +conventional methods, highlighting the framework's ability to capture complex +spatiotemporal dependencies while preserving data privacy. This work not only +provides a scalable and privacy-preserving solution for real-time, +region-specific forecasting and management but also underscores the potential +of leveraging distributed data sources in a FL context. We provide our +algorithms as open-source on GitHub. + +
+
+
+
+
+ + ☆ Leveraging priors on distribution functions for multi-arm bandits + + +
+ We introduce Dirichlet Process Posterior Sampling (DPPS), a Bayesian +non-parametric algorithm for multi-arm bandits based on Dirichlet Process (DP) +priors. Like Thompson-sampling, DPPS is a probability-matching algorithm, i.e., +it plays an arm based on its posterior-probability of being optimal. Instead of +assuming a parametric class for the reward generating distribution of each arm, +and then putting a prior on the parameters, in DPPS the reward generating +distribution is directly modeled using DP priors. DPPS provides a principled +approach to incorporate prior belief about the bandit environment, and in the +noninformative limit of the DP posteriors (i.e. Bayesian Bootstrap), we recover +Non Parametric Thompson Sampling (NPTS), a popular non-parametric bandit +algorithm, as a special case of DPPS. We employ stick-breaking representation +of the DP priors, and show excellent empirical performance of DPPS in +challenging synthetic and real world bandit environments. Finally, using an +information-theoretic analysis, we show non-asymptotic optimality of DPPS in +the Bayesian regret setup. + +
+
+
+
+
+ + ☆ STX-Search: Explanation Search for Continuous Dynamic Spatio-Temporal + Models + + +
+ Recent improvements in the expressive power of spatio-temporal models have +led to performance gains in many real-world applications, such as traffic +forecasting and social network modelling. However, understanding the +predictions from a model is crucial to ensure reliability and trustworthiness, +particularly for high-risk applications, such as healthcare and transport. Few +existing methods are able to generate explanations for models trained on +continuous-time dynamic graph data and, of these, the computational complexity +and lack of suitable explanation objectives pose challenges. In this paper, we +propose $\textbf{S}$patio-$\textbf{T}$emporal E$\textbf{X}$planation +$\textbf{Search}$ (STX-Search), a novel method for generating instance-level +explanations that is applicable to static and dynamic temporal graph +structures. We introduce a novel search strategy and objective function, to +find explanations that are highly faithful and interpretable. When compared +with existing methods, STX-Search produces explanations of higher fidelity +whilst optimising explanation size to maintain interpretability. + +
+
+
+
+
+ + ☆ A Morse Transform for Drug Discovery + + +
+ We introduce a new ligand-based virtual screening (LBVS) framework that uses +piecewise linear (PL) Morse theory to predict ligand binding potential. We +model ligands as simplicial complexes via a pruned Delaunay triangulation, and +catalogue the critical points across multiple directional height functions. +This produces a rich feature vector, consisting of crucial topological features +-- peaks, troughs, and saddles -- that characterise ligand surfaces relevant to +binding interactions. Unlike contemporary LBVS methods that rely on +computationally-intensive deep neural networks, we require only a lightweight +classifier. The Morse theoretic approach achieves state-of-the-art performance +on standard datasets while offering an interpretable feature vector and +scalable method for ligand prioritization in early-stage drug discovery. + +
+
+ comment: 25 pages, 5 main figures, 2 main tables, 6 supplementary figures and + 4 supplementary tables +
+
+
+
+
+ + ☆ Learning Object Placement Programs for Indoor Scene Synthesis with + Iterative Self Training + + +
+ Data driven and autoregressive indoor scene synthesis systems generate indoor +scenes automatically by suggesting and then placing objects one at a time. +Empirical observations show that current systems tend to produce incomplete +next object location distributions. We introduce a system which addresses this +problem. We design a Domain Specific Language (DSL) that specifies functional +constraints. Programs from our language take as input a partial scene and +object to place. Upon execution they predict possible object placements. We +design a generative model which writes these programs automatically. Available +3D scene datasets do not contain programs to train on, so we build upon +previous work in unsupervised program induction to introduce a new program +bootstrapping algorithm. In order to quantify our empirical observations we +introduce a new evaluation procedure which captures how well a system models +per-object location distributions. We ask human annotators to label all the +possible places an object can go in a scene and show that our system produces +per-object location distributions more consistent with human annotators. Our +system also generates indoor scenes of comparable quality to previous systems +and while previous systems degrade in performance when training data is sparse, +our system does not degrade to the same degree. + +
+
+ comment: 21 pages, 20 figures Subjects: Graphics (cs.GR), Computer Vision and + Pattern Recognition (cs.CV), Machine Learning (cs.LG) +
+
+
+
+
+ + ☆ Accurate predictive model of band gap with selected important features + based on explainable machine learning + + +
+ In the rapidly advancing field of materials informatics, nonlinear machine +learning models have demonstrated exceptional predictive capabilities for +material properties. However, their black-box nature limits interpretability, +and they may incorporate features that do not contribute to, or even +deteriorate, model performance. This study employs explainable ML (XML) +techniques, including permutation feature importance and the SHapley Additive +exPlanation, applied to a pristine support vector regression model designed to +predict band gaps at the GW level using 18 input features. Guided by +XML-derived individual feature importance, a simple framework is proposed to +construct reduced-feature predictive models. Model evaluations indicate that an +XML-guided compact model, consisting of the top five features, achieves +comparable accuracy to the pristine model on in-domain datasets while +demonstrating superior generalization with lower prediction errors on +out-of-domain data. Additionally, the study underscores the necessity for +eliminating strongly correlated features to prevent misinterpretation and +overestimation of feature importance before applying XML. This study highlights +XML's effectiveness in developing simplified yet highly accurate machine +learning models by clarifying feature roles. + +
+
+ comment: 9 pages, 4 figures, SI is included +
+
+
+
+
+ + ☆ InfoSEM: A Deep Generative Model with Informative Priors for Gene + Regulatory Network Inference ICLR 2025 + + +
+ Inferring Gene Regulatory Networks (GRNs) from gene expression data is +crucial for understanding biological processes. While supervised models are +reported to achieve high performance for this task, they rely on costly ground +truth (GT) labels and risk learning gene-specific biases, such as class +imbalances of GT interactions, rather than true regulatory mechanisms. To +address these issues, we introduce InfoSEM, an unsupervised generative model +that leverages textual gene embeddings as informative priors, improving GRN +inference without GT labels. InfoSEM can also integrate GT labels as an +additional prior when available, avoiding biases and further enhancing +performance. Additionally, we propose a biologically motivated benchmarking +framework that better reflects real-world applications such as biomarker +discovery and reveals learned biases of existing supervised methods. InfoSEM +outperforms existing models by 38.5% across four datasets using textual +embeddings prior and further boosts performance by 11.1% when integrating +labeled data as priors. + +
+
+ comment: ICLR 2025 AI4NA Oral, ICLR 2025 MLGenX Spotlight, ICLR 2025 LMRL +
+
+
+
+
+ + ☆ Generalized Interpolating Discrete Diffusion + + +
+ While state-of-the-art language models achieve impressive results through +next-token prediction, they have inherent limitations such as the inability to +revise already generated tokens. This has prompted exploration of alternative +approaches such as discrete diffusion. However, masked diffusion, which has +emerged as a popular choice due to its simplicity and effectiveness, +reintroduces this inability to revise words. To overcome this, we generalize +masked diffusion and derive the theoretical backbone of a family of general +interpolating discrete diffusion (GIDD) processes offering greater flexibility +in the design of the noising processes. Leveraging a novel diffusion ELBO, we +achieve compute-matched state-of-the-art performance in diffusion language +modeling. Exploiting GIDD's flexibility, we explore a hybrid approach combining +masking and uniform noise, leading to improved sample quality and unlocking the +ability for the model to correct its own mistakes, an area where autoregressive +models notoriously have struggled. Our code and models are open-source: +https://github.com/dvruette/gidd/ + +
+
+
+
+
+ + ☆ Poisoning Bayesian Inference via Data Deletion and Replication + + +
+ Research in adversarial machine learning (AML) has shown that statistical +models are vulnerable to maliciously altered data. However, despite advances in +Bayesian machine learning models, most AML research remains concentrated on +classical techniques. Therefore, we focus on extending the white-box model +poisoning paradigm to attack generic Bayesian inference, highlighting its +vulnerability in adversarial contexts. A suite of attacks are developed that +allow an attacker to steer the Bayesian posterior toward a target distribution +through the strategic deletion and replication of true observations, even when +only sampling access to the posterior is available. Analytic properties of +these algorithms are proven and their performance is empirically examined in +both synthetic and real-world scenarios. With relatively little effort, the +attacker is able to substantively alter the Bayesian's beliefs and, by +accepting more risk, they can mold these beliefs to their will. By carefully +constructing the adversarial posterior, surgical poisoning is achieved such +that only targeted inferences are corrupted and others are minimally disturbed. + +
+
+
+
+
+ + ☆ Know Thy Judge: On the Robustness Meta-Evaluation of LLM Safety Judges ICLR'25 + + +
+ Large Language Model (LLM) based judges form the underpinnings of key safety +evaluation processes such as offline benchmarking, automated red-teaming, and +online guardrailing. This widespread requirement raises the crucial question: +can we trust the evaluations of these evaluators? In this paper, we highlight +two critical challenges that are typically overlooked: (i) evaluations in the +wild where factors like prompt sensitivity and distribution shifts can affect +performance and (ii) adversarial attacks that target the judge. We highlight +the importance of these through a study of commonly used safety judges, showing +that small changes such as the style of the model output can lead to jumps of +up to 0.24 in the false negative rate on the same dataset, whereas adversarial +attacks on the model generation can fool some judges into misclassifying 100% +of harmful generations as safe ones. These findings reveal gaps in commonly +used meta-evaluation benchmarks and weaknesses in the robustness of current LLM +judges, indicating that low attack success under certain judges could create a +false sense of security. + +
+
+ comment: Accepted to the ICBINB Workshop at ICLR'25 +
+
+
+
+
+ + ☆ DAST: Difficulty-Adaptive Slow-Thinking for Large Reasoning Models + + +
+ Recent advancements in slow-thinking reasoning models have shown exceptional +performance in complex reasoning tasks. However, these models often exhibit +overthinking, generating redundant reasoning steps for simple problems, leading +to excessive computational resource usage. While current mitigation strategies +uniformly reduce reasoning tokens, they risk degrading performance on +challenging tasks that require extended reasoning. This paper introduces +Difficulty-Adaptive Slow-Thinking (DAST), a novel framework that enables models +to autonomously adjust the length of Chain-of-Thought (CoT) based on problem +difficulty. We first propose a Token Length Budget (TLB) metric to quantify +difficulty, then leverage length-aware reward shaping and length preference +optimization to implement DAST. DAST penalizes overlong responses for simple +tasks while incentivizing sufficient reasoning for complex problems. +Experiments on diverse datasets and model scales demonstrate that DAST +effectively mitigates overthinking (reducing token usage by over 30\% on +average) while preserving reasoning accuracy on complex problems. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ An artificially intelligent magnetic resonance spectroscopy + quantification method: Comparison between QNet and LCModel on the cloud + computing platform CloudBrain-MRS + + +
+ Objectives: This work aimed to statistically compare the metabolite +quantification of human brain magnetic resonance spectroscopy (MRS) between the +deep learning method QNet and the classical method LCModel through an +easy-to-use intelligent cloud computing platform CloudBrain-MRS. Materials and +Methods: In this retrospective study, two 3 T MRI scanners Philips Ingenia and +Achieva collected 61 and 46 in vivo 1H magnetic resonance (MR) spectra of +healthy participants, respectively, from the brain region of pregenual anterior +cingulate cortex from September to October 2021. The analyses of Bland-Altman, +Pearson correlation and reasonability were performed to assess the degree of +agreement, linear correlation and reasonability between the two quantification +methods. Results: Fifteen healthy volunteers (12 females and 3 males, age +range: 21-35 years, mean age/standard deviation = 27.4/3.9 years) were +recruited. The analyses of Bland-Altman, Pearson correlation and reasonability +showed high to good consistency and very strong to moderate correlation between +the two methods for quantification of total N-acetylaspartate (tNAA), total +choline (tCho), and inositol (Ins) (relative half interval of limits of +agreement = 3.04%, 9.3%, and 18.5%, respectively; Pearson correlation +coefficient r = 0.775, 0.927, and 0.469, respectively). In addition, +quantification results of QNet are more likely to be closer to the previous +reported average values than those of LCModel. Conclusion: There were high or +good degrees of consistency between the quantification results of QNet and +LCModel for tNAA, tCho, and Ins, and QNet generally has more reasonable +quantification than LCModel. + +
+
+
+
+
+ + ☆ PALo: Learning Posture-Aware Locomotion for Quadruped Robots + + +
+ With the rapid development of embodied intelligence, locomotion control of +quadruped robots on complex terrains has become a research hotspot. Unlike +traditional locomotion control approaches focusing solely on velocity tracking, +we pursue to balance the agility and robustness of quadruped robots on diverse +and complex terrains. To this end, we propose an end-to-end deep reinforcement +learning framework for posture-aware locomotion named PALo, which manages to +handle simultaneous linear and angular velocity tracking and real-time +adjustments of body height, pitch, and roll angles. In PALo, the locomotion +control problem is formulated as a partially observable Markov decision +process, and an asymmetric actor-critic architecture is adopted to overcome the +sim-to-real challenge. Further, by incorporating customized training curricula, +PALo achieves agile posture-aware locomotion control in simulated environments +and successfully transfers to real-world settings without fine-tuning, allowing +real-time control of the quadruped robot's locomotion and body posture across +challenging terrains. Through in-depth experimental analysis, we identify the +key components of PALo that contribute to its performance, further validating +the effectiveness of the proposed method. The results of this study provide new +possibilities for the low-level locomotion control of quadruped robots in +higher dimensional command spaces and lay the foundation for future research on +upper-level modules for embodied intelligence. + +
+
+
+
+
+ + ☆ Reproducibility Assessment of Magnetic Resonance Spectroscopy of + Pregenual Anterior Cingulate Cortex across Sessions and Vendors via the Cloud + Computing Platform CloudBrain-MRS + + +
+ Given the need to elucidate the mechanisms underlying illnesses and their +treatment, as well as the lack of harmonization of acquisition and +post-processing protocols among different magnetic resonance system vendors, +this work is to determine if metabolite concentrations obtained from different +sessions, machine models and even different vendors of 3 T scanners can be +highly reproducible and be pooled for diagnostic analysis, which is very +valuable for the research of rare diseases. Participants underwent magnetic +resonance imaging (MRI) scanning once on two separate days within one week (one +session per day, each session including two proton magnetic resonance +spectroscopy (1H-MRS) scans with no more than a 5-minute interval between scans +(no off-bed activity)) on each machine. The data were analyzed for reliability of +within- and between- sessions using the coefficient of variation (CV) and +intraclass correlation coefficient (ICC), and for reproducibility across the +machines using correlation coefficient. As for within- and between- session, +all CV values for a group of all the first or second scans of a session, or for +a session were almost below 20%, and most of the ICCs for metabolites range +from moderate (0.4-0.59) to excellent (0.75-1), indicating high data +reliability. When it comes to the reproducibility across the three scanners, +all Pearson correlation coefficients across the three machines approached 1 +with most around 0.9, and majority demonstrated statistical significance +(P<0.01). Additionally, the intra-vendor reproducibility was greater than the +inter-vendor ones. + +
+
+
+
+
+ + ☆ Privacy Preserving and Robust Aggregation for Cross-Silo Federated + Learning in Non-IID Settings + + +
+ Federated Averaging remains the most widely used aggregation strategy in +federated learning due to its simplicity and scalability. However, its +performance degrades significantly in non-IID data settings, where client +distributions are highly imbalanced or skewed. Additionally, it relies on +clients transmitting metadata, specifically the number of training samples, +which introduces privacy risks and may conflict with regulatory frameworks like +the European GDPR. In this paper, we propose a novel aggregation strategy that +addresses these challenges by introducing class-aware gradient masking. Unlike +traditional approaches, our method relies solely on gradient updates, +eliminating the need for any additional client metadata, thereby enhancing +privacy protection. Furthermore, our approach validates and dynamically weights +client contributions based on class-specific importance, ensuring robustness +against non-IID distributions, convergence prevention, and backdoor attacks. +Extensive experiments on benchmark datasets demonstrate that our method not +only outperforms FedAvg and other widely accepted aggregation strategies in +non-IID settings but also preserves model integrity in adversarial scenarios. +Our results establish the effectiveness of gradient masking as a practical and +secure solution for federated learning. + +
+
+
+
+
+ + ☆ A Graph-Partitioning Based Continuous Optimization Approach to + Semi-supervised Clustering Problems + + +
+ Semi-supervised clustering is a basic problem in various applications. Most +existing methods require knowledge of the ideal cluster number, which is often +difficult to obtain in practice. Besides, satisfying the must-link constraints +is another major challenge for these methods. In this work, we view the +semi-supervised clustering task as a partitioning problem on a graph associated +with the given dataset, where the similarity matrix includes a scaling +parameter to reflect the must-link constraints. Utilizing a relaxation +technique, we formulate the graph partitioning problem into a continuous +optimization model that does not require the exact cluster number, but only an +overestimate of it. We then propose a block coordinate descent algorithm to +efficiently solve this model, and establish its convergence result. Based on +the obtained solution, we can construct the clusters that theoretically meet +the must-link constraints under mild assumptions. Furthermore, we verify the +effectiveness and efficiency of our proposed method through comprehensive +numerical experiments. + +
+
+
+
+
+ + ☆ FORTALESA: Fault-Tolerant Reconfigurable Systolic Array for DNN + Inference + + +
+ The emergence of Deep Neural Networks (DNNs) in mission- and safety-critical +applications brings their reliability to the front. High performance demands of +DNNs require the use of specialized hardware accelerators. Systolic array +architecture is widely used in DNN accelerators due to its parallelism and +regular structure. This work presents a run-time reconfigurable systolic array +architecture with three execution modes and four implementation options. All +four implementations are evaluated in terms of resource utilization, +throughput, and fault tolerance improvement. The proposed architecture is used +for reliability enhancement of DNN inference on systolic array through +heterogeneous mapping of different network layers to different execution modes. +The approach is supported by a novel reliability assessment method based on +fault propagation analysis. It is used for the exploration of the appropriate +execution mode-layer mapping for DNN inference. The proposed architecture +efficiently protects registers and MAC units of systolic array PEs from +transient and permanent faults. The reconfigurability feature enables a speedup +of up to $3\times$, depending on layer vulnerability. Furthermore, it requires +$6\times$ less resources compared to static redundancy and $2.5\times$ less +resources compared to the previously proposed solution for transient faults. + +
+
+ comment: 11 pages, 15 figures +
+
+
+
+
+ + ☆ Determinant Estimation under Memory Constraints and Neural Scaling Laws + + +
+ Calculating or accurately estimating log-determinants of large positive +semi-definite matrices is of fundamental importance in many machine learning +tasks. While its cubic computational complexity can already be prohibitive, in +modern applications, even storing the matrices themselves can pose a memory +bottleneck. To address this, we derive a novel hierarchical algorithm based on +block-wise computation of the LDL decomposition for large-scale log-determinant +calculation in memory-constrained settings. In extreme cases where matrices are +highly ill-conditioned, accurately computing the full matrix itself may be +infeasible. This is particularly relevant when considering kernel matrices at +scale, including the empirical Neural Tangent Kernel (NTK) of neural networks +trained on large datasets. Under the assumption of neural scaling laws in the +test error, we show that the ratio of pseudo-determinants satisfies a power-law +relationship, allowing us to derive corresponding scaling laws. This enables +accurate estimation of NTK log-determinants from a tiny fraction of the full +dataset; in our experiments, this results in a $\sim$100,000$\times$ speedup +with improved accuracy over competing approximations. Using these techniques, +we successfully estimate log-determinants for dense matrices of extreme sizes, +which were previously deemed intractable and inaccessible due to their enormous +scale and computational demands. + +
+
+
+
+
+ + ☆ AOLO: Analysis and Optimization For Low-Carbon Oriented Wireless Large + Language Model Services + + +
+ Recent advancements in large language models (LLMs) have led to their +widespread adoption and large-scale deployment across various domains. However, +their environmental impact, particularly during inference, has become a growing +concern due to their substantial energy consumption and carbon footprint. +Existing research has focused on inference computation alone, overlooking the +analysis and optimization of carbon footprint in network-aided LLM service +systems. To address this gap, we propose AOLO, a framework for analysis and +optimization for low-carbon oriented wireless LLM services. AOLO introduces a +comprehensive carbon footprint model that quantifies greenhouse gas emissions +across the entire LLM service chain, including computational inference and +wireless communication. Furthermore, we formulate an optimization problem aimed +at minimizing the overall carbon footprint, which is solved through joint +optimization of inference outputs and transmit power under +quality-of-experience and system performance constraints. To achieve this joint +optimization, we leverage the energy efficiency of spiking neural networks +(SNNs) by adopting SNN as the actor network and propose a low-carbon-oriented +optimization algorithm, i.e., SNN-based deep reinforcement learning (SDRL). +Comprehensive simulations demonstrate that SDRL algorithm significantly reduces +overall carbon footprint, achieving an 18.77% reduction compared to the +benchmark soft actor-critic, highlighting its potential for enabling more +sustainable LLM inference services. + +
+
+
+
+
+ + ☆ Learning Transformer-based World Models with Contrastive Predictive + Coding + + +
+ The DreamerV3 algorithm recently obtained remarkable performance across +diverse environment domains by learning an accurate world model based on +Recurrent Neural Networks (RNNs). Following the success of model-based +reinforcement learning algorithms and the rapid adoption of the Transformer +architecture for its superior training efficiency and favorable scaling +properties, recent works such as STORM have proposed replacing RNN-based world +models with Transformer-based world models using masked self-attention. +However, despite the improved training efficiency of these methods, their +impact on performance remains limited compared to the Dreamer algorithm, +struggling to learn competitive Transformer-based world models. In this work, +we show that the next state prediction objective adopted in previous approaches +is insufficient to fully exploit the representation capabilities of +Transformers. We propose to extend world model predictions to longer time +horizons by introducing TWISTER (Transformer-based World model wIth contraSTivE +Representations), a world model using action-conditioned Contrastive Predictive +Coding to learn high-level temporal feature representations and improve the +agent performance. TWISTER achieves a human-normalized mean score of 162% on +the Atari 100k benchmark, setting a new record among state-of-the-art methods +that do not employ look-ahead search. + +
+
+
+
+
+ + ☆ Training-Free Graph Filtering via Multimodal Feature Refinement for + Extremely Fast Multimodal Recommendation + + +
+ Multimodal recommender systems improve the performance of canonical +recommender systems with no item features by utilizing diverse content types +such as text, images, and videos, while alleviating inherent sparsity of +user-item interactions and accelerating user engagement. However, current +neural network-based models often incur significant computational overhead due +to the complex training process required to learn and integrate information +from multiple modalities. To overcome this limitation, we propose +MultiModal-Graph Filtering (MM-GF), a training-free method based on the notion +of graph filtering (GF) for efficient and accurate multimodal recommendations. +Specifically, MM-GF first constructs multiple similarity graphs through +nontrivial multimodal feature refinement such as robust scaling and vector +shifting by addressing the heterogeneous characteristics across modalities. +Then, MM-GF optimally fuses multimodal information using linear low-pass +filters across different modalities. Extensive experiments on real-world +benchmark datasets demonstrate that MM-GF not only improves recommendation +accuracy by up to 13.35% compared to the best competitor but also dramatically +reduces computational costs by achieving the runtime of less than 10 seconds. + +
+
+ comment: 10 pages, 6 figures, 6 tables +
+
+
+
+
+ + ☆ Temporal Analysis of NetFlow Datasets for Network Intrusion Detection + Systems + + +
+ This paper investigates the temporal analysis of NetFlow datasets for machine +learning (ML)-based network intrusion detection systems (NIDS). Although many +previous studies have highlighted the critical role of temporal features, such +as inter-packet arrival time and flow length/duration, in NIDS, the currently +available NetFlow datasets for NIDS lack these temporal features. This study +addresses this gap by creating and making publicly available a set of NetFlow +datasets that incorporate these temporal features [1]. With these temporal +features, we provide a comprehensive temporal analysis of NetFlow datasets by +examining the distribution of various features over time and presenting +time-series representations of NetFlow features. This temporal analysis has not +been previously provided in the existing literature. We also borrowed an idea +from signal processing, time frequency analysis, and tested it to see how +different the time frequency signal presentations (TFSPs) are for various +attacks. The results indicate that many attacks have unique patterns, which +could help ML models to identify them more easily. + +
+
+
+
+
+ + ☆ Speculative MoE: Communication Efficient Parallel MoE Inference with + Speculative Token and Expert Pre-scheduling + + +
+ MoE (Mixture of Experts) prevails as a neural architecture that can scale +modern transformer-based LLMs (Large Language Models) to unprecedented scales. +Nevertheless, large MoEs' great demands of computing power, memory capacity and +memory bandwidth make scalable serving a fundamental challenge and efficient +parallel inference has become a requisite to attain adequate throughput under +latency constraints. DeepSpeed-MoE, one state-of-the-art MoE inference +framework, adopts a 3D-parallel paradigm including EP (Expert Parallelism), TP +(Tensor Parallel) and DP (Data Parallelism). However, our analysis shows +DeepSpeed-MoE's inference efficiency is largely bottlenecked by EP, which is +implemented with costly all-to-all collectives to route token activation. Our +work aims to boost DeepSpeed-MoE by strategically reducing EP's communication +overhead with a technique named Speculative MoE. Speculative MoE has two +speculative parallelization schemes, speculative token shuffling and +speculative expert grouping, which predict outstanding tokens' expert routing +paths and pre-schedule tokens and experts across devices to losslessly trim +EP's communication volume. Besides DeepSpeed-MoE, we also build Speculative MoE +into a prevailing MoE inference engine SGLang. Experiments show Speculative MoE +can significantly boost state-of-the-art MoE inference frameworks on fast +homogeneous and slow heterogeneous interconnects. + +
+
+
+
+
+ + ☆ Time-varying Factor Augmented Vector Autoregression with Grouped Sparse + Autoencoder + + +
+ Recent economic events, including the global financial crisis and COVID-19 +pandemic, have exposed limitations in linear Factor Augmented Vector +Autoregressive (FAVAR) models for forecasting and structural analysis. +Nonlinear dimension techniques, particularly autoencoders, have emerged as +promising alternatives in a FAVAR framework, but challenges remain in +identifiability, interpretability, and integration with traditional nonlinear +time series methods. We address these challenges through two contributions. +First, we introduce a Grouped Sparse autoencoder that employs the +Spike-and-Slab Lasso prior, with parameters under this prior being shared +across variables of the same economic category, thereby achieving +semi-identifiability and enhancing model interpretability. Second, we +incorporate time-varying parameters into the VAR component to better capture +evolving economic dynamics. Our empirical application to the US economy +demonstrates that the Grouped Sparse autoencoder produces more interpretable +factors through its parsimonious structure; and its combination with +time-varying parameter VAR shows superior performance in both point and density +forecasting. Impulse response analysis reveals that monetary policy shocks +during recessions generate more moderate responses with higher uncertainty +compared to expansionary periods. + +
+
+
+
+
+ + ☆ Dedicated Feedback and Edit Models Empower Inference-Time Scaling for + Open-Ended General-Domain Tasks + + +
+ Inference-Time Scaling has been critical to the success of recent models such +as OpenAI o1 and DeepSeek R1. However, many techniques used to train models for +inference-time scaling require tasks to have answers that can be verified, +limiting their application to domains such as math, coding and logical +reasoning. We take inspiration from how humans make first attempts, ask for +detailed feedback from others and make improvements based on such feedback +across a wide spectrum of open-ended endeavors. To this end, we collect data +for and train dedicated Feedback and Edit Models that are capable of performing +inference-time scaling for open-ended general-domain tasks. In our setup, one +model generates an initial response, which is given feedback by a second +model; that feedback is then used by a third model to edit the response. We show that +performance on Arena Hard, a benchmark strongly predictive of Chatbot Arena Elo +can be boosted by scaling the number of initial response drafts, effective +feedback and edited responses. When scaled optimally, our setup based on 70B +models from the Llama 3 family can reach SoTA performance on Arena Hard at 92.7 +as of 5 Mar 2025, surpassing OpenAI o1-preview-2024-09-12 with 90.4 and +DeepSeek R1 with 92.3. + +
+
+ comment: 22 pages, 2 figures +
+
+
+
+
+ + ☆ How can representation dimension dominate structurally pruned LLMs? ICLR 2025 + + +
+ Pruning assumes a subnetwork exists in the original deep neural network, +which can achieve comparative model performance with less computation than the +original. However, it is unclear how the model performance varies with the +different subnetwork extractions. In this paper, we choose the representation +dimension (or embedding dimension, model dimension, the dimension of the +residual stream in the relevant literature) as the entry point to this issue. +We investigate the linear transformations in the LLM transformer blocks and +consider a specific structured pruning approach, SliceGPT, to extract the +subnetworks of different representation dimensions. We mechanistically analyse +the activation flow during the model forward passes, and find the +representation dimension dominates the linear transformations, model +predictions, and, finally, the model performance. Explicit analytical relations +are given to calculate the pruned model performance (perplexity and accuracy) +without actual evaluation, and are empirically validated with +Llama-3-8B-Instruct and Phi-3-mini-4k-Instruct. + +
+
+ comment: ICLR 2025 Workshop on Sparsity in LLMs (SLLM) +
+
+
+
+
+ + ☆ FILM: Framework for Imbalanced Learning Machines based on a new unbiased + performance measure and a new ensemble-based technique + + +
+ This research addresses the challenges of handling unbalanced datasets for +binary classification tasks. In such scenarios, standard evaluation metrics are +often biased by the disproportionate representation of the minority class. +Conducting experiments across seven datasets, we uncovered inconsistencies in +evaluation metrics when determining the model that outperforms others for each +binary classification problem. This justifies the need for a metric that +provides a more consistent and unbiased evaluation across unbalanced datasets, +thereby supporting robust model selection. To mitigate this problem, we propose +a novel metric, the Unbiased Integration Coefficients (UIC), which exhibits +significantly reduced bias ($p < 10^{-4}$) towards the minority class compared +to conventional metrics. The UIC is constructed by aggregating existing metrics +while penalising those more prone to imbalance. In addition, we introduce the +Identical Partitions for Imbalance Problems (IPIP) algorithm for imbalanced ML +problems, an ensemble-based approach. Our experimental results show that IPIP +outperforms other baseline imbalance-aware approaches using Random Forest and +Logistic Regression models in three out of seven datasets as assessed by the +UIC metric, demonstrating its effectiveness in addressing imbalanced data +challenges in binary classification tasks. This new framework for dealing with +imbalanced datasets is materialized in the FILM (Framework for Imbalanced +Learning Machines) R Package, accessible at https://github.com/antoniogt/FILM. + +
+
+
+
+
+ + ☆ Causally Reliable Concept Bottleneck Models + + +
+ Concept-based models are an emerging paradigm in deep learning that +constrains the inference process to operate through human-interpretable +concepts, facilitating explainability and human interaction. However, these +architectures, on par with popular opaque neural models, fail to account for +the true causal mechanisms underlying the target phenomena represented in the +data. This hampers their ability to support causal reasoning tasks, limits +out-of-distribution generalization, and hinders the implementation of fairness +constraints. To overcome these issues, we propose \emph{Causally reliable +Concept Bottleneck Models} (C$^2$BMs), a class of concept-based architectures +that enforce reasoning through a bottleneck of concepts structured according to +a model of the real-world causal mechanisms. We also introduce a pipeline to +automatically learn this structure from observational data and +\emph{unstructured} background knowledge (e.g., scientific literature). +Experimental evidence suggests that C$^2$BMs are more interpretable, causally +reliable, and improve responsiveness to interventions w.r.t. standard opaque +and concept-based models, while maintaining their accuracy. + +
+
+
+
+
+ + ☆ A Generalist Cross-Domain Molecular Learning Framework for + Structure-Based Drug Discovery + + +
+ Structure-based drug discovery (SBDD) is a systematic scientific process that +develops new drugs by leveraging the detailed physical structure of the target +protein. Recent advancements in pre-trained models for biomolecules have +demonstrated remarkable success across various biochemical applications, +including drug discovery and protein engineering. However, in most approaches, +the pre-trained models primarily focus on the characteristics of either small +molecules or proteins, without delving into their binding interactions which +are essential cross-domain relationships pivotal to SBDD. To fill this gap, we +propose a general-purpose foundation model named BIT (an abbreviation for +Biomolecular Interaction Transformer), which is capable of encoding a range of +biochemical entities, including small molecules, proteins, and protein-ligand +complexes, as well as various data formats, encompassing both 2D and 3D +structures. Specifically, we introduce Mixture-of-Domain-Experts (MoDE) to +handle the biomolecules from diverse biochemical domains and +Mixture-of-Structure-Experts (MoSE) to capture positional dependencies in the +molecular structures. The proposed mixture-of-experts approach enables BIT to +achieve both deep fusion and domain-specific encoding, effectively capturing +fine-grained molecular interactions within protein-ligand complexes. Then, we +perform cross-domain pre-training on the shared Transformer backbone via +several unified self-supervised denoising tasks. Experimental results on +various benchmarks demonstrate that BIT achieves exceptional performance in +downstream tasks, including binding affinity prediction, structure-based +virtual screening, and molecular property prediction. + +
+
+
+
+
+ + ☆ Learning Causal Response Representations through Direct Effect Analysis + + +
+ We propose a novel approach for learning causal response representations. Our +method aims to extract directions in which a multidimensional outcome is most +directly caused by a treatment variable. By bridging conditional independence +testing with causal representation learning, we formulate an optimisation +problem that maximises the evidence against conditional independence between +the treatment and outcome, given a conditioning set. This formulation employs +flexible regression models tailored to specific applications, creating a +versatile framework. The problem is addressed through a generalised eigenvalue +decomposition. We show that, under mild assumptions, the distribution of the +largest eigenvalue can be bounded by a known $F$-distribution, enabling +testable conditional independence. We also provide theoretical guarantees for +the optimality of the learned representation in terms of signal-to-noise ratio +and Fisher information maximisation. Finally, we demonstrate the empirical +effectiveness of our approach in simulation and real-world experiments. Our +results underscore the utility of this framework in uncovering direct causal +effects within complex, multivariate settings. + +
+
+ comment: 32 pages, 15 figures, stat.ML +
+
+
+
+
+ + scDD: Latent Codes Based scRNA-seq Dataset Distillation with Foundation + Model Knowledge + + +
+ Single-cell RNA sequencing (scRNA-seq) technology has profiled hundreds of +millions of human cells across organs, diseases, development and perturbations +to date. However, the high-dimensional sparsity, batch effect noise, category +imbalance, and ever-increasing data scale of the original sequencing data pose +significant challenges for multi-center knowledge transfer, data fusion, and +cross-validation between scRNA-seq datasets. To address these barriers, (1) we +first propose a latent codes-based scRNA-seq dataset distillation framework +named scDD, which transfers and distills foundation model knowledge and +original dataset information into a compact latent space and generates +a synthetic scRNA-seq dataset by a generator to replace the original dataset. +Then, (2) we propose a single-step conditional diffusion generator named SCDG, +which performs single-step gradient back-propagation to help scDD optimize +distillation quality and avoid gradient decay caused by multi-step +back-propagation. Meanwhile, SCDG ensures the scRNA-seq data characteristics +and inter-class discriminability of the synthetic dataset through flexible +conditional control and generation quality assurance. Finally, we propose a +comprehensive benchmark to evaluate the performance of scRNA-seq dataset +distillation in different data analysis tasks. It is validated that our +proposed method can achieve 7.61% absolute and 15.70% relative improvement over +previous state-of-the-art methods on average task. + +
+
+
+
+
+ + ☆ EDCA -- An Evolutionary Data-Centric AutoML Framework for Efficient + Pipelines + + +
+ Automated Machine Learning (AutoML) gained popularity due to the increased +demand for Machine Learning (ML) specialists, allowing them to apply ML +techniques effortlessly and quickly. AutoML implementations use optimisation +methods to identify the most effective ML solution for a given dataset, aiming +to improve one or more predefined metrics. However, most implementations focus +on model selection and hyperparameter tuning. Despite being an important factor +in obtaining high-performance ML systems, data quality is usually an overlooked +part of AutoML and continues to be a manual and time-consuming task. This work +presents EDCA, an Evolutionary Data Centric AutoML framework. In addition to +the traditional tasks such as selecting the best models and hyperparameters, +EDCA enhances the given data by optimising data processing tasks such as data +reduction and cleaning according to the problems' needs. All these steps create +an ML pipeline that is optimised by an evolutionary algorithm. To assess its +effectiveness, EDCA was compared to FLAML and TPOT, two frameworks at the top +of the AutoML benchmarks. The frameworks were evaluated in the same conditions +using datasets from AMLB classification benchmarks. EDCA achieved statistically +similar results in performance to FLAML and TPOT but used significantly less +data to train the final solutions. Moreover, EDCA experimental results reveal +that a good performance can be achieved using less data and efficient ML +algorithms, aspects that align with Green AutoML guidelines. + +
+
+
+
+
+ + ☆ Large Language Models for Zero-shot Inference of Causal Structures in + Biology ICLR 2025 + + +
+ Genes, proteins and other biological entities influence one another via +causal molecular networks. Causal relationships in such networks are mediated +by complex and diverse mechanisms, through latent variables, and are often +specific to cellular context. It remains challenging to characterise such +networks in practice. Here, we present a novel framework to evaluate large +language models (LLMs) for zero-shot inference of causal relationships in +biology. In particular, we systematically evaluate causal claims obtained from +an LLM using real-world interventional data. This is done over one hundred +variables and thousands of causal hypotheses. Furthermore, we consider several +prompting and retrieval-augmentation strategies, including large, and +potentially conflicting, collections of scientific articles. Our results show +that with tailored augmentation and prompting, even relatively small LLMs can +capture meaningful aspects of causal structure in biological systems. This +supports the notion that LLMs could act as orchestration tools in biological +discovery, by helping to distil current knowledge in ways amenable to +downstream analysis. Our approach to assessing LLMs with respect to +experimental data is relevant for a broad range of problems at the intersection +of causal learning, LLMs and scientific discovery. + +
+
+ comment: ICLR 2025 Workshop on Machine Learning for Genomics Explorations +
+
+
+
+
+ + ☆ TRANSIT your events into a new mass: Fast background interpolation for + weakly-supervised anomaly searches + + +
+ We introduce a new model for conditional and continuous data morphing called +TRansport Adversarial Network for Smooth InTerpolation (TRANSIT). We apply it +to create a background data template for weakly-supervised searches at the LHC. +The method smoothly transforms sideband events to match signal region mass +distributions. We demonstrate the performance of TRANSIT using the LHC Olympics +R\&D dataset. The model captures non-linear mass correlations of features and +produces a template that offers a competitive anomaly sensitivity compared to +state-of-the-art transport-based template generators. Moreover, the +computational training time required for TRANSIT is an order of magnitude lower +than that of competing deep learning methods. This makes it ideal for analyses +that iterate over many signal regions and signal models. Unlike generative +models, which must learn a full probability density distribution, i.e., the +correlations between all the variables, the proposed transport model only has +to learn a smooth conditional shift of the distribution. This allows for a +simpler, more efficient residual architecture, enabling mass uncorrelated +features to pass the network unchanged while the mass correlated features are +adjusted accordingly. Furthermore, we show that the latent space of the model +provides a set of mass decorrelated features useful for anomaly detection +without background sculpting. + +
+
+ comment: 34 pages, 14 figures +
+
+
+
+
+ + ☆ The Challenge of Identifying the Origin of Black-Box Large Language + Models + + +
+ The tremendous commercial potential of large language models (LLMs) has +heightened concerns about their unauthorized use. Third parties can customize +LLMs through fine-tuning and offer only black-box API access, effectively +concealing unauthorized usage and complicating external auditing processes. +This practice not only exacerbates unfair competition, but also violates +licensing agreements. In response, identifying the origin of black-box LLMs is +an intrinsic solution to this issue. In this paper, we first reveal the +limitations of state-of-the-art passive and proactive identification methods +with experiments on 30 LLMs and two real-world black-box APIs. Then, we propose +the proactive technique, PlugAE, which optimizes adversarial token embeddings +in a continuous space and proactively plugs them into the LLM for tracing and +identification. The experiments show that PlugAE can achieve substantial +improvement in identifying fine-tuned derivatives. We further advocate for +legal frameworks and regulations to better address the challenges posed by the +unauthorized use of LLMs. + +
+
+
+
+
+ + ☆ InFL-UX: A Toolkit for Web-Based Interactive Federated Learning + + +
+ This paper presents InFL-UX, an interactive, proof-of-concept browser-based +Federated Learning (FL) toolkit designed to integrate user contributions +seamlessly into the machine learning (ML) workflow. InFL-UX enables users +across multiple devices to upload datasets, define classes, and collaboratively +train classification models directly in the browser using modern web +technologies. Unlike traditional FL toolkits, which often focus on backend +simulations, InFL-UX provides a simple user interface for researchers to +explore how users interact with and contribute to FL systems in real-world, +interactive settings. By prioritising usability and decentralised model +training, InFL-UX bridges the gap between FL and Interactive Machine Learning +(IML), empowering non-technical users to actively participate in ML +classification tasks. + +
+
+
+
+
+ + ☆ Provable Robust Overfitting Mitigation in Wasserstein Distributionally + Robust Optimization + + +
+ Wasserstein distributionally robust optimization (WDRO) optimizes against +worst-case distributional shifts within a specified uncertainty set, leading to +enhanced generalization on unseen adversarial examples, compared to standard +adversarial training which focuses on pointwise adversarial perturbations. +However, WDRO still suffers fundamentally from the robust overfitting problem, +as it does not consider statistical error. We address this gap by proposing a +novel robust optimization framework under a new uncertainty set for adversarial +noise via Wasserstein distance and statistical error via Kullback-Leibler +divergence, called the Statistically Robust WDRO. We establish a robust +generalization bound for the new optimization framework, implying that +out-of-distribution adversarial performance is at least as good as the +statistically robust training loss with high probability. Furthermore, we +derive conditions under which Stackelberg and Nash equilibria exist between the +learner and the adversary, giving an optimal robust model in certain sense. +Finally, through extensive experiments, we demonstrate that our method +significantly mitigates robust overfitting and enhances robustness within the +framework of WDRO. + +
+
+
+
+
+ + ☆ Explainable AI in Time-Sensitive Scenarios: Prefetched Offline + Explanation Model + + +
+ As predictive machine learning models become increasingly adopted and +advanced, their role has evolved from merely predicting outcomes to actively +shaping them. This evolution has underscored the importance of Trustworthy AI, +highlighting the necessity to extend our focus beyond mere accuracy and toward +a comprehensive understanding of these models' behaviors within the specific +contexts of their applications. To further progress in explainability, we +introduce Poem, Prefetched Offline Explanation Model, a model-agnostic, local +explainability algorithm for image data. The algorithm generates exemplars, +counterexemplars and saliency maps to provide quick and effective explanations +suitable for time-sensitive scenarios. Leveraging an existing local algorithm, +Poem infers factual and counterfactual rules from data to create +illustrative examples and opposite scenarios with an enhanced stability by +design. A novel mechanism then matches incoming test points with an explanation +base and produces diverse exemplars, informative saliency maps and believable +counterexemplars. Experimental results indicate that Poem outperforms its +predecessor Abele in speed and ability to generate more nuanced and varied +exemplars alongside more insightful saliency maps and valuable +counterexemplars. + +
+
+
+
+
+ + ☆ Towards Autonomous Reinforcement Learning for Real-World Robotic + Manipulation with Large Language Models + + +
+ Recent advancements in Large Language Models (LLMs) and Visual Language +Models (VLMs) have significantly impacted robotics, enabling high-level +semantic motion planning applications. Reinforcement Learning (RL), a +complementary paradigm, enables agents to autonomously optimize complex +behaviors through interaction and reward signals. However, designing effective +reward functions for RL remains challenging, especially in real-world tasks +where sparse rewards are insufficient and dense rewards require elaborate +design. In this work, we propose Autonomous Reinforcement learning for Complex +Human-Informed Environments (ARCHIE), an unsupervised pipeline leveraging GPT-4, +a pre-trained LLM, to generate reward functions directly from natural language +task descriptions. The rewards are used to train RL agents in simulated +environments, where we formalize the reward generation process to enhance +feasibility. Additionally, GPT-4 automates the coding of task success criteria, +creating a fully automated, one-shot procedure for translating human-readable +text into deployable robot skills. Our approach is validated through extensive +simulated experiments on single-arm and bi-manual manipulation tasks using an +ABB YuMi collaborative robot, highlighting its practicality and effectiveness. +Tasks are demonstrated on the real robot setup. + +
+
+
+
+
+ + ☆ A General Framework for Scalable UE-AP Association in User-Centric + Cell-Free Massive MIMO based on Recurrent Neural Networks + + +
+ This study addresses the challenge of access point (AP) and user equipment +(UE) association in cell-free massive MIMO networks. It introduces a deep +learning algorithm leveraging Bidirectional Long Short-Term Memory cells and a +hybrid probabilistic methodology for weight updating. This approach enhances +scalability by adapting to variations in the number of UEs without requiring +retraining. Additionally, the study presents a training methodology that +improves scalability not only with respect to the number of UEs but also to the +number of APs. Furthermore, a variant of the proposed AP-UE algorithm ensures +robustness against pilot contamination effects, a critical issue arising from +pilot reuse in channel estimation. Extensive numerical results validate the +effectiveness and adaptability of the proposed methods, demonstrating their +superiority over widely used heuristic alternatives. + +
+
+ comment: submitted to IEEE journal +
+
+
+
+
+ + ☆ Frequency Hopping Synchronization by Reinforcement Learning for + Satellite Communication System + + +
+ Satellite communication systems (SCSs) used for tactical purposes require +robust security and anti-jamming capabilities, making frequency hopping (FH) a +powerful option. However, the current FH systems face challenges due to +significant interference from other devices and the considerable path loss +inherent in satellite communication. This misalignment leads to inefficient +synchronization, crucial for maintaining reliable communication. Traditional +methods, such as those employing long short-term memory (LSTM) networks, have +made improvements, but they still struggle in dynamic conditions of satellite +environments. This paper presents a novel method for synchronizing FH signals +in tactical SCSs by combining serial search and reinforcement learning to +achieve coarse and fine acquisition, respectively. The mathematical analysis +and simulation results demonstrate that the proposed method reduces the average +number of hops required for synchronization by 58.17% and mean squared error +(MSE) of the uplink hop timing estimation by 76.95%, as compared to the +conventional serial search method. Compared with the early-late gate +synchronization method based on serial search and use of LSTM network, the +average number of hops for synchronization is reduced by 12.24% and the MSE by +18.5%. + +
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ☆ Bi-Lipschitz Ansatz for Anti-Symmetric Functions + + +
+ Motivated by applications for simulating quantum many body functions, we +propose a new universal ansatz for approximating anti-symmetric functions. The +main advantage of this ansatz over previous alternatives is that it is +bi-Lipschitz with respect to a naturally defined metric. As a result, we are +able to obtain quantitative approximation results for approximation of +Lipschitz continuous antisymmetric functions. Moreover, we provide preliminary +experimental evidence to the improved performance of this ansatz for learning +antisymmetric functions. + +
+
+
+
+
+ + ☆ Knowledge Retention for Continual Model-Based Reinforcement Learning + + +
+ We propose DRAGO, a novel approach for continual model-based reinforcement +learning aimed at improving the incremental development of world models across +a sequence of tasks that differ in their reward functions but not the state +space or dynamics. DRAGO comprises two key components: Synthetic Experience +Rehearsal, which leverages generative models to create synthetic experiences +from past tasks, allowing the agent to reinforce previously learned dynamics +without storing data, and Regaining Memories Through Exploration, which +introduces an intrinsic reward mechanism to guide the agent toward revisiting +relevant states from prior tasks. Together, these components enable the agent +to maintain a comprehensive and continually developing world model, +facilitating more effective learning and adaptation across diverse +environments. Empirical evaluations demonstrate that DRAGO is able to preserve +knowledge across tasks, achieving superior performance in various continual +learning scenarios. + +
+
+
+
+
+ + ☆ RCRank: Multimodal Ranking of Root Causes of Slow Queries in Cloud + Database Systems VLDB 2025 + + +
+ With the continued migration of storage to cloud database systems, the impact +of slow queries in such systems on services and user experience is increasing. +Root-cause diagnosis plays an indispensable role in facilitating slow-query +detection and revision. This paper proposes a method capable of both +identifying possible root cause types for slow queries and ranking these +according to their potential for accelerating slow queries. This enables +prioritizing root causes with the highest impact, in turn improving slow-query +revision effectiveness. To enable more accurate and detailed diagnoses, we +propose the multimodal Ranking for the Root Causes of slow queries (RCRank) +framework, which formulates root cause analysis as a multimodal machine +learning problem and leverages multimodal information from query statements, +execution plans, execution logs, and key performance indicators. To obtain +expressive embeddings from its heterogeneous multimodal input, RCRank +integrates self-supervised pre-training that enhances cross-modal alignment and +task relevance. Next, the framework integrates root-cause-adaptive cross +Transformers that enable adaptive fusion of multimodal features with varying +characteristics. Finally, the framework offers a unified model that features an +impact-aware training objective for identifying and ranking root causes. We +report on experiments on real and synthetic datasets, finding that RCRank is +capable of consistently outperforming the state-of-the-art methods at root +cause identification and ranking according to a range of metrics. + +
+
+ comment: Accepted by VLDB 2025 +
+
+
+
+
+ + ☆ How to Mitigate Overfitting in Weak-to-strong Generalization? + + +
+ Aligning powerful AI models on tasks that surpass human evaluation +capabilities is the central problem of superalignment. To address this +problem, weak-to-strong generalization aims to elicit the capabilities of +strong models through weak supervisors and ensure that the behavior of strong +models aligns with the intentions of weak supervisors without unsafe behaviors +such as deception. Although weak-to-strong generalization exhibits certain +generalization capabilities, strong models exhibit significant overfitting in +weak-to-strong generalization: Due to the strong fit ability of strong models, +erroneous labels from weak supervisors may lead to overfitting in strong +models. In addition, simply filtering out incorrect labels may lead to a +degeneration in question quality, resulting in a weak generalization ability of +strong models on hard questions. To mitigate overfitting in weak-to-strong +generalization, we propose a two-stage framework that simultaneously improves +the quality of supervision signals and the quality of input questions. +Experimental results in three series of large language models and two +mathematical benchmarks demonstrate that our framework significantly improves +PGR compared to naive weak-to-strong generalization, even achieving up to 100% +PGR on some models. + +
+
+
+
+
+ + ☆ Incorporating Surrogate Gradient Norm to Improve Offline Optimization + Techniques + + +
+ Offline optimization has recently emerged as an increasingly popular approach +to mitigate the prohibitively expensive cost of online experimentation. The key +idea is to learn a surrogate of the black-box function that underlines the +target experiment using a static (offline) dataset of its previous input-output +queries. Such an approach is, however, fraught with an out-of-distribution +issue where the learned surrogate becomes inaccurate outside the offline data +regimes. To mitigate this, existing offline optimizers have proposed numerous +conditioning techniques to prevent the learned surrogate from being too +erratic. Nonetheless, such conditioning strategies are often specific to +particular surrogate or search models, which might not generalize to a +different model choice. This motivates us to develop a model-agnostic approach +instead, which incorporates a notion of model sharpness into the training loss +of the surrogate as a regularizer. Our approach is supported by a new +theoretical analysis demonstrating that reducing surrogate sharpness on the +offline dataset provably reduces its generalized sharpness on unseen data. Our +analysis extends existing theories from bounding generalized prediction loss +(on unseen data) with loss sharpness to bounding the worst-case generalized +surrogate sharpness with its empirical estimate on training data, providing a +new perspective on sharpness regularization. Our extensive experimentation on a +diverse range of optimization tasks also shows that reducing surrogate +sharpness often leads to significant improvement, marking (up to) a noticeable +9.6% performance boost. Our code is publicly available at +https://github.com/cuong-dm/IGNITE + +
+
+
+
+
+ + ☆ ThrowBench: Benchmarking LLMs by Predicting Runtime Exceptions + + +
+ Modern Large Language Models (LLMs) have shown astounding capabilities of +code understanding and synthesis. In order to assess such capabilities, several +benchmarks have been devised (e.g., HumanEval). However, most benchmarks focus +on code synthesis from natural language instructions. Hence, such benchmarks do +not test for other forms of code understanding. Moreover, there have been +concerns about contamination and leakage. That is, benchmark problems (or +closely related problems) may appear in training set, strongly biasing +benchmark results. In this work we investigate whether large language models +can correctly predict runtime program behavior. To this end, we introduce +ThrowBench, a benchmark consisting of over 2,400 short user-written programs +written in four different programming languages. The majority of these programs +throw an exception during runtime (due to a bug). LLMs are asked to predict +whether a presented program throws an exception and, if so, which one. +Evaluating our benchmark on six state-of-the-art code LLMs we see modest +performance ranging from 19 to 38% (F1 score). Benchmarking a wider set of code +capabilities could improve the assessment of code LLMs and help identify weak +points in current models. Moreover, as ground-truth answers have been +determined through program execution, leakage is not a concern. We release +ThrowBench as well as all of our results together with this work. + +
+
+
+
+
+ + ☆ One-Shot Clustering for Federated Learning + + +
+ Federated Learning (FL) is a widespread and well adopted paradigm of +decentralized learning that allows training one model from multiple sources +without the need to directly transfer data between participating clients. Since +its inception in 2015, it has been divided into numerous sub-fields that deal +with application-specific issues, be it data heterogeneity or resource +allocation. One such sub-field, Clustered Federated Learning (CFL), is dealing +with the problem of clustering the population of clients into separate cohorts +to deliver personalized models. Although a few remarkable works have been +published in this domain, the problem is still largely unexplored, as its basic +assumption and settings are slightly different from standard FL. In this work, +we present One-Shot Clustered Federated Learning (OCFL), a clustering-agnostic +algorithm that can automatically detect the earliest suitable moment for +clustering. Our algorithm is based on the computation of cosine similarity +between gradients of the clients and a temperature measure that detects when +the federated model starts to converge. We empirically evaluate our methodology +by testing various one-shot clustering algorithms for over thirty different +tasks on three benchmark datasets. Our experiments showcase the good +performance of our approach when used to perform CFL in an automated manner +without the need to adjust hyperparameters. + +
+
+
+
+
+ + ☆ Synthetic Data is an Elegant GIFT for Continual Vision-Language Models CVPR 2025 + + +
+ Pre-trained Vision-Language Models (VLMs) require Continual Learning (CL) to +efficiently update their knowledge and adapt to various downstream tasks +without retraining from scratch. However, for VLMs, in addition to the loss of +knowledge previously learned from downstream tasks, pre-training knowledge is +also corrupted during continual fine-tuning. This issue is exacerbated by the +unavailability of original pre-training data, leaving VLM's generalization +ability degrading. In this paper, we propose GIFT, a novel continual +fine-tuning approach that utilizes synthetic data to overcome catastrophic +forgetting in VLMs. Taking advantage of recent advances in text-to-image +synthesis, we employ a pre-trained diffusion model to recreate both +pre-training and learned downstream task data. In this way, the VLM can revisit +previous knowledge through distillation on matching diffusion-generated images +and corresponding text prompts. Leveraging the broad distribution and high +alignment between synthetic image-text pairs in VLM's feature space, we propose +a contrastive distillation loss along with an image-text alignment constraint. +To further combat in-distribution overfitting and enhance distillation +performance with limited amount of generated data, we incorporate adaptive +weight consolidation, utilizing Fisher information from these synthetic +image-text pairs and achieving a better stability-plasticity balance. Extensive +experiments demonstrate that our method consistently outperforms previous +state-of-the-art approaches across various settings. + +
+
+ comment: This work is accepted by CVPR 2025. Modifications may be performed +
+
+
+
+
+ + ☆ Quantum-Inspired Reinforcement Learning in the Presence of Epistemic + Ambivalence + + +
+ The complexity of online decision-making under uncertainty stems from the +requirement of finding a balance between exploiting known strategies and +exploring new possibilities. Naturally, the uncertainty type plays a crucial +role in developing decision-making strategies that manage complexity +effectively. In this paper, we focus on a specific form of uncertainty known as +epistemic ambivalence (EA), which emerges from conflicting pieces of evidence +or contradictory experiences. It creates a delicate interplay between +uncertainty and confidence, distinguishing it from epistemic uncertainty that +typically diminishes with new information. Indeed, ambivalence can persist even +after additional knowledge is acquired. To address this phenomenon, we propose +a novel framework, called the epistemically ambivalent Markov decision process +(EA-MDP), aiming to understand and control EA in decision-making processes. +This framework incorporates the concept of a quantum state from the quantum +mechanics formalism, and its core is to assess the probability and reward of +every possible outcome. We calculate the reward function using quantum +measurement techniques and prove the existence of an optimal policy and an +optimal value function in the EA-MDP framework. We also propose the +EA-epsilon-greedy Q-learning algorithm. To evaluate the impact of EA on +decision-making and the expedience of our framework, we study two distinct +experimental setups, namely the two-state problem and the lattice problem. Our +results show that using our methods, the agent converges to the optimal policy +in the presence of EA. + +
+
+
+
+
+ + ☆ FUSE: First-Order and Second-Order Unified SynthEsis in Stochastic + Optimization + + +
+ Stochastic optimization methods have actively been playing a critical role in +modern machine learning algorithms to deliver decent performance. While +numerous works have proposed and developed diverse approaches, first-order and +second-order methods are in entirely different situations. The former is +significantly pivotal and dominating in emerging deep learning but only leads +convergence to a stationary point. However, second-order methods are less +popular due to their computational intensity in large-dimensional problems. +This paper presents a novel method that leverages both the first-order and +second-order methods in a unified algorithmic framework, termed FUSE, from +which a practical version (PV) is derived accordingly. FUSE-PV stands as a +simple yet efficient optimization method involving a switch-over between first +and second orders. Additionally, we develop different criteria that determine +when to switch. FUSE-PV has provably shown a smaller computational complexity +than SGD and Adam. To validate our proposed scheme, we present an ablation +study on several simple test functions and show a comparison with baselines for +benchmark datasets. + +
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ☆ Geometric Re-Analysis of Classical MDP Solving Algorithms + + +
+ We build on a recently introduced geometric interpretation of Markov Decision +Processes (MDPs) to analyze classical MDP-solving algorithms: Value Iteration +(VI) and Policy Iteration (PI). First, we develop a geometry-based analytical +apparatus, including a transformation that modifies the discount factor +$\gamma$, to improve convergence guarantees for these algorithms in several +settings. In particular, one of our results identifies a rotation component in +the VI method, and as a consequence shows that when a Markov Reward Process +(MRP) induced by the optimal policy is irreducible and aperiodic, the +asymptotic convergence rate of value iteration is strictly smaller than +$\gamma$. + +
+
+
+
+
+ + ♻ ☆ When Can You Get Away with Low Memory Adam? + + +
+ Adam is the go-to optimizer for training modern machine learning models, but +it requires additional memory to maintain the moving averages of the gradients +and their squares. While various low-memory optimizers have been proposed that +sometimes match the performance of Adam, their lack of reliability has left +Adam as the default choice. In this work, we apply a simple layer-wise +Signal-to-Noise Ratio (SNR) analysis to quantify when second-moment tensors can +be effectively replaced by their means across different dimensions. Our SNR +analysis reveals how architecture, training hyperparameters, and dataset +properties impact compressibility along Adam's trajectory, naturally leading to +$\textit{SlimAdam}$, a memory-efficient Adam variant. $\textit{SlimAdam}$ +compresses the second moments along dimensions with high SNR when feasible, and +leaves when compression would be detrimental. Through experiments across a +diverse set of architectures and training scenarios, we show that +$\textit{SlimAdam}$ matches Adam's performance and stability while saving up to +$98\%$ of total second moments. Code for $\textit{SlimAdam}$ is available at +https://github.com/dayal-kalra/low-memory-adam. + +
+
+ comment: Acknowledgement updates and minor writing edits +
+
+
+
+
+ + ♻ ☆ RAAD-LLM: Adaptive Anomaly Detection Using LLMs and RAG Integration + + +
+ Anomaly detection in complex industrial environments poses unique challenges, +particularly in contexts characterized by data sparsity and evolving +operational conditions. Predictive maintenance (PdM) in such settings demands +methodologies that are adaptive, transferable, and capable of integrating +domain-specific knowledge. In this paper, we present RAAD-LLM, a novel +framework for adaptive anomaly detection, leveraging large language models +(LLMs) integrated with Retrieval-Augmented Generation (RAG). This approach +addresses the aforementioned PdM challenges. By effectively utilizing +domain-specific knowledge, RAAD-LLM enhances the detection of anomalies in time +series data without requiring fine-tuning on specific datasets. The framework's +adaptability mechanism enables it to adjust its understanding of normal +operating conditions dynamically, thus increasing detection accuracy. We +validate this methodology through a real-world application for a plastics +manufacturing plant and the Skoltech Anomaly Benchmark (SKAB). Results show +significant improvements over our previous model with an accuracy increase from +70.7% to 89.1% on the real-world dataset. By allowing for the enriching of +input series data with semantics, RAAD-LLM incorporates multimodal capabilities +that facilitate more collaborative decision-making between the model and plant +operators. Overall, our findings support RAAD-LLM's ability to revolutionize +anomaly detection methodologies in PdM, potentially leading to a paradigm shift +in how anomaly detection is implemented across various industries. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2411.00914 +
+
+
+
+
+ + ♻ ☆ The Last Iterate Advantage: Empirical Auditing and Principled Heuristic + Analysis of Differentially Private SGD ICLR 2025 + + +
+ We propose a simple heuristic privacy analysis of noisy clipped stochastic +gradient descent (DP-SGD) in the setting where only the last iterate is +released and the intermediate iterates remain hidden. Namely, our heuristic +assumes a linear structure for the model. + We show experimentally that our heuristic is predictive of the outcome of +privacy auditing applied to various training procedures. Thus it can be used +prior to training as a rough estimate of the final privacy leakage. We also +probe the limitations of our heuristic by providing some artificial +counterexamples where it underestimates the privacy leakage. + The standard composition-based privacy analysis of DP-SGD effectively assumes +that the adversary has access to all intermediate iterates, which is often +unrealistic. However, this analysis remains the state of the art in practice. +While our heuristic does not replace a rigorous privacy analysis, it +illustrates the large gap between the best theoretical upper bounds and the +privacy auditing lower bounds and sets a target for further work to improve the +theoretical privacy analyses. We also empirically support our heuristic and +show existing privacy auditing attacks are bounded by our heuristic analysis in +both vision and language tasks. + +
+
+ comment: ICLR 2025 camera-ready version +
+
+
+
+
+ + ♻ ☆ Some Targets Are Harder to Identify than Others: Quantifying the + Target-dependent Membership Leakage AISTATS 2025 + + +
+ In a Membership Inference (MI) game, an attacker tries to infer whether a +target point was included or not in the input of an algorithm. Existing works +show that some target points are easier to identify, while others are harder. +This paper explains the target-dependent hardness of membership attacks by +studying the powers of the optimal attacks in a fixed-target MI game. We +characterise the optimal advantage and trade-off functions of attacks against +the empirical mean in terms of the Mahalanobis distance between the target +point and the data-generating distribution. We further derive the impacts of +two privacy defences, i.e. adding Gaussian noise and sub-sampling, and that of +target misspecification on optimal attacks. As by-products of our novel +analysis of the Likelihood Ratio (LR) test, we provide a new covariance attack +which generalises and improves the scalar product attack. Also, we propose a +new optimal canary-choosing strategy for auditing privacy in the white-box +federated learning setting. Our experiments validate that the Mahalanobis score +explains the hardness of fixed-target MI games. + +
+
+ comment: Appears in AISTATS 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ AdaptBot: Combining LLM with Knowledge Graphs and Human Input for + Generic-to-Specific Task Decomposition and Knowledge Refinement ICRA + + +
+ An embodied agent assisting humans is often asked to complete new tasks, and +there may not be sufficient time or labeled examples to train the agent to +perform these new tasks. Large Language Models (LLMs) trained on considerable +knowledge across many domains can be used to predict a sequence of abstract +actions for completing such tasks, although the agent may not be able to +execute this sequence due to task-, agent-, or domain-specific constraints. Our +framework addresses these challenges by leveraging the generic predictions +provided by LLM and the prior domain knowledge encoded in a Knowledge Graph +(KG), enabling an agent to quickly adapt to new tasks. The robot also solicits +and uses human input as needed to refine its existing knowledge. Based on +experimental evaluation in the context of cooking and cleaning tasks in +simulation domains, we demonstrate that the interplay between LLM, KG, and +human input leads to substantial performance gains compared with just using the +LLM. Project website{\S}: https://sssshivvvv.github.io/adaptbot/ + +
+
+ comment: Accepted to IEEE International Conference on Robotics and Automation + (ICRA) 2025 +
+
+
+
+
+ + ♻ ☆ Detecting Systematic Weaknesses in Vision Models along Predefined + Human-Understandable Dimensions + + +
+ Slice discovery methods (SDMs) are prominent algorithms for finding +systematic weaknesses in DNNs. They identify top-k semantically coherent +slices/subsets of data where a DNN-under-test has low performance. For being +directly useful, slices should be aligned with human-understandable and +relevant dimensions, which, for example, are defined by safety and domain +experts as part of the operational design domain (ODD). While SDMs can be +applied effectively on structured data, their application on image data is +complicated by the lack of semantic metadata. To address these issues, we +present an algorithm that combines foundation models for zero-shot image +classification to generate semantic metadata with methods for combinatorial +search to find systematic weaknesses in images. In contrast to existing +approaches, ours identifies weak slices that are in line with pre-defined +human-understandable dimensions. As the algorithm includes foundation models, +its intermediate and final results may not always be exact. Therefore, we +include an approach to address the impact of noisy metadata. We validate our +algorithm on both synthetic and real-world datasets, demonstrating its ability +to recover human-understandable systematic weaknesses. Furthermore, using our +approach, we identify systematic weaknesses of multiple pre-trained and +publicly available state-of-the-art computer vision DNNs. + +
+
+
+
+
+ + ♻ ☆ Back Home: A Machine Learning Approach to Seashell Classification and + Ecosystem Restoration + + +
+ In Costa Rica, an average of 5 tons of seashells are extracted from +ecosystems annually. Confiscated seashells cannot be returned to their +ecosystems due to the lack of origin recognition. To address this issue, we +developed a convolutional neural network (CNN) specifically for seashell +identification. We built a dataset from scratch, consisting of approximately +19,000 images from the Pacific and Caribbean coasts. Using this dataset, the +model achieved a classification accuracy exceeding 85%. The model has been +integrated into a user-friendly application, which has classified over 36,000 +seashells to date, delivering real-time results within 3 seconds per image. To +further enhance the system's accuracy, an anomaly detection mechanism was +incorporated to filter out irrelevant or anomalous inputs, ensuring only valid +seashell images are processed. + +
+
+
+
+
+ + ♻ ☆ Tutorial on amortized optimization + + +
+ Optimization is a ubiquitous modeling tool and is often deployed in settings +which repeatedly solve similar instances of the same problem. Amortized +optimization methods use learning to predict the solutions to problems in these +settings, exploiting the shared structure between similar problem instances. +These methods have been crucial in variational inference and reinforcement +learning and are capable of solving optimization problems many orders of +magnitude faster than traditional optimization methods that do not use +amortization. This tutorial presents an introduction to the amortized +optimization foundations behind these advancements and overviews their +applications in variational inference, sparse coding, gradient-based +meta-learning, control, reinforcement learning, convex optimization, optimal +transport, and deep equilibrium networks. The source code for this tutorial is +available at +https://github.com/facebookresearch/amortized-optimization-tutorial. + +
+
+ comment: Foundations and Trends in Machine Learning +
+
+
+
+
+ + ♻ ☆ A Simple and Effective Reinforcement Learning Method for Text-to-Image + Diffusion Fine-tuning + + +
+ Reinforcement learning (RL)-based fine-tuning has emerged as a powerful +approach for aligning diffusion models with black-box objectives. Proximal +policy optimization (PPO) is the most popular choice of method for policy +optimization. While effective in terms of performance, PPO is highly sensitive +to hyper-parameters and involves substantial computational overhead. REINFORCE, +on the other hand, mitigates some computational complexities such as high +memory overhead and sensitive hyper-parameter tuning, but has suboptimal +performance due to high-variance and sample inefficiency. While the variance of +the REINFORCE can be reduced by sampling multiple actions per input prompt and +using a baseline correction term, it still suffers from sample inefficiency. To +address these challenges, we systematically analyze the +efficiency-effectiveness trade-off between REINFORCE and PPO, and propose +leave-one-out PPO (LOOP), a novel RL for diffusion fine-tuning method. LOOP +combines variance reduction techniques from REINFORCE, such as sampling +multiple actions per input prompt and a baseline correction term, with the +robustness and sample efficiency of PPO via clipping and importance sampling. +Our results demonstrate that LOOP effectively improves diffusion models on +various black-box objectives, and achieves a better balance between +computational efficiency and performance. + +
+
+
+
+
+ + ♻ ☆ Human-Feedback Efficient Reinforcement Learning for Online Diffusion + Model Finetuning ICLR + + +
+ Controllable generation through Stable Diffusion (SD) fine-tuning aims to +improve fidelity, safety, and alignment with human guidance. Existing +reinforcement learning from human feedback methods usually rely on predefined +heuristic reward functions or pretrained reward models built on large-scale +datasets, limiting their applicability to scenarios where collecting such data +is costly or difficult. To effectively and efficiently utilize human feedback, +we develop a framework, HERO, which leverages online human feedback collected +on the fly during model learning. Specifically, HERO features two key +mechanisms: (1) Feedback-Aligned Representation Learning, an online training +method that captures human feedback and provides informative learning signals +for fine-tuning, and (2) Feedback-Guided Image Generation, which involves +generating images from SD's refined initialization samples, enabling faster +convergence towards the evaluator's intent. We demonstrate that HERO is 4x more +efficient in online feedback for body part anomaly correction compared to the +best existing method. Additionally, experiments show that HERO can effectively +handle tasks like reasoning, counting, personalization, and reducing NSFW +content with only 0.5K online feedback. + +
+
+ comment: Published in International Conference on Learning Representations + (ICLR) 2025 +
+
+
+
+
+ + ♻ ☆ A learning-based approach to stochastic optimal control under + reach-avoid constraint + + +
+ We develop a model-free approach to optimally control stochastic, Markovian +systems subject to a reach-avoid constraint. Specifically, the state trajectory +must remain within a safe set while reaching a target set within a finite time +horizon. Due to the time-dependent nature of these constraints, we show that, +in general, the optimal policy for this constrained stochastic control problem +is non-Markovian, which increases the computational complexity. To address this +challenge, we apply the state-augmentation technique from arXiv:2402.19360, +reformulating the problem as a constrained Markov decision process (CMDP) on an +extended state space. This transformation allows us to search for a Markovian +policy, avoiding the complexity of non-Markovian policies. To learn the optimal +policy without a system model, and using only trajectory data, we develop a +log-barrier policy gradient approach. We prove that under suitable assumptions, +the policy parameters converge to the optimal parameters, while ensuring that +the system trajectories satisfy the stochastic reach-avoid constraint with high +probability. + +
+
+
+
+
+ + ♻ ☆ Towards One Model for Classical Dimensionality Reduction: A + Probabilistic Perspective on UMAP and t-SNE + + +
+ This paper shows that dimensionality reduction methods such as UMAP and +t-SNE can be approximately recast as MAP inference methods corresponding to a +model introduced in ProbDR, that describes the graph Laplacian (an estimate of +the data precision matrix) using a Wishart distribution, with a mean given by a +non-linear covariance function evaluated on the latents. This interpretation +offers deeper theoretical and semantic insights into such algorithms, by +showing that variances corresponding to these covariances are low (potentially +misspecified), and forging a connection to Gaussian process latent variable +models by showing that well-known kernels can be used to describe covariances +implied by graph Laplacians. We also introduce tools with which similar +dimensionality reduction methods can be studied. + +
+
+ comment: Updated preprint +
+
+
+
+
+ + ♻ ☆ Protein Large Language Models: A Comprehensive Survey + + +
+ Protein-specific large language models (Protein LLMs) are revolutionizing +protein science by enabling more efficient protein structure prediction, +function annotation, and design. While existing surveys focus on specific +aspects or applications, this work provides the first comprehensive overview of +Protein LLMs, covering their architectures, training datasets, evaluation +metrics, and diverse applications. Through a systematic analysis of over 100 +articles, we propose a structured taxonomy of state-of-the-art Protein LLMs, +analyze how they leverage large-scale protein sequence data for improved +accuracy, and explore their potential in advancing protein engineering and +biomedical research. Additionally, we discuss key challenges and future +directions, positioning Protein LLMs as essential tools for scientific +discovery in protein science. Resources are maintained at +https://github.com/Yijia-Xiao/Protein-LLM-Survey. + +
+
+ comment: 24 pages, 4 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Streaming Private Continual Counting via Binning + + +
+ In differential privacy, $\textit{continual observation}$ refers to problems +in which we wish to continuously release a function of a dataset that is +revealed one element at a time. The challenge is to maintain a good +approximation while keeping the combined output over all time steps +differentially private. In the special case of $\textit{continual counting}$ we +seek to approximate a sum of binary input elements. This problem has received +considerable attention lately, in part due to its relevance in implementations +of differentially private stochastic gradient descent. $\textit{Factorization +mechanisms}$ are the leading approach to continual counting, but the best such +mechanisms do not work well in $\textit{streaming}$ settings since they require +space proportional to the size of the input. In this paper, we present a simple +approach to approximating factorization mechanisms in low space via +$\textit{binning}$, where adjacent matrix entries with similar values are +changed to be identical in such a way that a matrix-vector product can be +maintained in sublinear space. Our approach has provable sublinear space +guarantees for a class of lower triangular matrices whose entries are +monotonically decreasing away from the diagonal. We show empirically that even +with very low space usage we are able to closely match, and sometimes surpass, +the performance of asymptotically optimal factorization mechanisms. Recently, +and independently of our work, Dvijotham et al. have also suggested an approach +to implementing factorization mechanisms in a streaming setting. Their work +differs from ours in several respects: It only addresses factorization into +$\textit{Toeplitz}$ matrices, only considers $\textit{maximum}$ error, and uses +a different technique based on rational function approximation that seems less +versatile than our binning approach. + +
+
+ comment: Accepted to SaTML 2025. Final version to appear on IEEE eXplore +
+
+
+
+
+ + ♻ ☆ $\texttt{SEM-CTRL}$: Semantically Controlled Decoding + + +
+ Ensuring both syntactic and semantic correctness in Large Language Model +(LLM) outputs remains a significant challenge, despite being critical for +real-world deployment. In this paper, we introduce $\texttt{SEM-CTRL}$, a +unified approach that enforces rich context-sensitive constraints and task- and +instance-specific semantics directly on an LLM decoder. Our approach integrates +token-level MCTS, which is guided by specific syntactic and semantic +constraints. The constraints over the desired outputs are expressed using +Answer Set Grammars -- a logic-based formalism that generalizes +context-sensitive grammars while incorporating background knowledge to +represent task-specific semantics. We show that our approach guarantees correct +completions for any off-the-shelf LLM without the need for fine-tuning. We +evaluate $\texttt{SEM-CTRL}$ on a range of tasks, including synthetic grammar +synthesis, combinatorial reasoning, and planning. Our results demonstrate that +$\texttt{SEM-CTRL}$ allows small pre-trained LLMs to efficiently outperform +larger variants and state-of-the-art reasoning models (e.g., o1-preview) while +simultaneously guaranteeing solution correctness. + +
+
+
+
+
+ + ♻ ☆ Golden Ratio Weighting Prevents Model Collapse + + +
+ Recent studies identified an intriguing phenomenon in recursive generative +model training known as model collapse, where models trained on data generated +by previous models exhibit severe performance degradation. Addressing this +issue and developing more effective training strategies have become central +challenges in generative model research. In this paper, we investigate this +phenomenon theoretically within a novel framework, where generative models are +iteratively trained on a combination of newly collected real data and synthetic +data from the previous training step. To develop an optimal training strategy +for integrating real and synthetic data, we evaluate the performance of a +weighted training scheme in various scenarios, including Gaussian distribution +estimation and linear regression. We theoretically characterize the impact of +the mixing proportion and weighting scheme of synthetic data on the final +model's performance. Our key finding is that, across different settings, the +optimal weighting scheme under different proportions of synthetic data +asymptotically follows a unified expression, revealing a fundamental trade-off +between leveraging synthetic data and generative model performance. Notably, in +some cases, the optimal weight assigned to real data corresponds to the +reciprocal of the golden ratio. Finally, we validate our theoretical results on +extensive simulated datasets and a real tabular dataset. + +
+
+
+
+
+ + ♻ ☆ Beyond Single Concept Vector: Modeling Concept Subspace in LLMs with + Gaussian Distribution ICLR 2025 + + +
+ Probing learned concepts in large language models (LLMs) is crucial for +understanding how semantic knowledge is encoded internally. Training linear +classifiers on probing tasks is a principal approach to denote the vector of a +certain concept in the representation space. However, the single vector +identified for a concept varies with both data and training, making it less +robust and weakening its effectiveness in real-world applications. To address +this challenge, we propose an approach to approximate the subspace representing +a specific concept. Built on linear probing classifiers, we extend the concept +vectors into Gaussian Concept Subspace (GCS). We demonstrate GCS's +effectiveness through measuring its faithfulness and plausibility across +multiple LLMs with different sizes and architectures. Additionally, we use +representation intervention tasks to showcase its efficacy in real-world +applications such as emotion steering. Experimental results indicate that GCS +concept vectors have the potential to balance steering performance and +maintaining the fluency in natural language generation tasks. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ An Analysis Framework for Understanding Deep Neural Networks Based on + Network Dynamics + + +
+ Advancing artificial intelligence demands a deeper understanding of the +mechanisms underlying deep learning. Here, we propose a straightforward +analysis framework based on the dynamics of learning models. Neurons are +categorized into two modes based on whether their transformation functions +preserve order. This categorization reveals how deep neural networks (DNNs) +maximize information extraction by rationally allocating the proportion of +neurons in different modes across deep layers. We further introduce the +attraction basins of the training samples in both the sample vector space and +the weight vector space to characterize the generalization ability of DNNs. +This framework allows us to identify optimal depth and width configurations, +providing a unified explanation for fundamental DNN behaviors such as the "flat +minima effect," "grokking," and double descent phenomena. Our analysis extends +to networks with depths up to 100 layers. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks for Virtual Sensing in Complex Systems: Addressing + Heterogeneous Temporal Dynamics SP + + +
+ Real-time condition monitoring is crucial for the reliable and efficient +operation of complex systems. However, relying solely on physical sensors can +be limited due to their cost, placement constraints, or inability to directly +measure certain critical parameters. Virtual sensing addresses these +limitations by leveraging readily available sensor data and system knowledge to +estimate inaccessible parameters or infer system states. The increasing +complexity of industrial systems necessitates deployments of sensors with +diverse modalities to provide a comprehensive understanding of system states. +These sensors capture data at varying frequencies to monitor both rapid and +slowly varying system dynamics, as well as local and global state evolutions of +the systems. This leads to heterogeneous temporal dynamics, which, particularly +under varying operational and environmental conditions, pose a significant +challenge for accurate virtual sensing. To address this, we propose a +Heterogeneous Temporal Graph Neural Network (HTGNN) framework. HTGNN explicitly +models signals from diverse sensors and integrates operating conditions into +the model architecture. We evaluate HTGNN using two newly released datasets: a +bearing dataset with diverse load conditions for bearing load prediction and a +year-long simulated dataset for predicting bridge live loads. Our results +demonstrate that HTGNN significantly outperforms established baseline methods +in both tasks, particularly under highly varying operating conditions. These +results highlight HTGNN's potential as a robust and accurate virtual sensing +approach for complex systems, paving the way for improved monitoring, +predictive maintenance, and enhanced system performance. Our code and data are +available under https://github.com/EPFL-IMOS/htgnn. + +
+
+ comment: This paper extends our previous conference paper (Best Paper at + European Conference of the PHM Society 2024, + https://doi.org/10.36001/phme.2024.v8i1.3998). Accepted by Mechanical Systems + and Signal Processing (MSSP) +
+
+
+
+
+ + ♻ ☆ The FFT Strikes Back: An Efficient Alternative to Self-Attention + + +
+ Conventional self-attention mechanisms incur quadratic complexity, limiting +their scalability on long sequences. We introduce \textbf{FFTNet}, an adaptive +spectral filtering framework that leverages the Fast Fourier Transform (FFT) to +achieve global token mixing in $\mathcal{O}(n\log n)$ time. By transforming +inputs into the frequency domain, FFTNet exploits the orthogonality and energy +preservation guaranteed by Parseval's theorem to capture long-range +dependencies efficiently. Our main theoretical contributions are 1) an adaptive +spectral filter, 2) combining local windowing with a global FFT branch, and 3) +rich nonlinearity introduction in both the frequency and token domains. +Experiments on the Long Range Arena and ImageNet benchmarks validate our +theoretical insights and demonstrate superior performance over fixed Fourier +and standard attention models. + +
+
+
+
+
+ + ♻ ☆ X-Boundary: Establishing Exact Safety Boundary to Shield LLMs from + Multi-Turn Jailbreaks without Compromising Usability + + +
+ Despite the rapid development of safety alignment techniques for LLMs, +defending against multi-turn jailbreaks is still a challenging task. In this +paper, we conduct a comprehensive comparison, revealing that some existing +defense methods can improve the robustness of LLMs against multi-turn +jailbreaks but compromise usability, i.e., reducing general capabilities or +causing the over-refusal problem. From the perspective of mechanism +interpretability of LLMs, we discover that these methods fail to establish a +boundary that exactly distinguishes safe and harmful feature representations. +Therefore, boundary-safe representations close to harmful representations are +inevitably disrupted, leading to a decline in usability. To address this issue, +we propose X-Boundary to push harmful representations away from boundary-safe +representations and obtain an exact distinction boundary. In this way, harmful +representations can be precisely erased without disrupting safe ones. +Experimental results show that X-Boundary achieves state-of-the-art defense +performance against multi-turn jailbreaks, while reducing the over-refusal rate +by about 20% and maintaining nearly complete general capability. Furthermore, +we theoretically prove and empirically verify that X-Boundary can accelerate +the convergence process during training. Please see our code at: +https://github.com/AI45Lab/X-Boundary. + +
+
+
+
+
+ + ♻ ☆ Chunking the Critic: A Transformer-based Soft Actor-Critic with N-Step + Returns + + +
+ Soft Actor-Critic (SAC) critically depends on its critic network, which +typically evaluates a single state-action pair to guide policy updates. Using +N-step returns is a common practice to reduce the bias in the target values of +the critic. However, using N-step returns can again introduce high variance and +necessitates importance sampling, often destabilizing training. Recent +algorithms have also explored action chunking -- such as direct action repetition +and movement primitives -- to enhance exploration. In this paper, we propose a +Transformer-based Critic Network for SAC that integrates the N-returns +framework in a stable and efficient manner. Unlike approaches that perform +chunking in the actor network, we feed chunked actions into the critic network +to explore potential performance gains. Our architecture leverages the +Transformer's ability to process sequential information, facilitating more +robust value estimation. Empirical results show that this method not only +achieves efficient, stable training but also excels in sparse +reward/multi-phase environments -- traditionally a challenge for step-based +methods. These findings underscore the promise of combining Transformer-based +critics with N-returns to advance reinforcement learning performance. + +
+
+
+
+
+ + ♻ ☆ Learning finitely correlated states: stability of the spectral + reconstruction + + +
+ Matrix product operators allow efficient descriptions (or realizations) of +states on a 1D lattice. We consider the task of learning a realization of +minimal dimension from copies of an unknown state, such that the resulting +operator is close to the density matrix in trace norm. For finitely correlated +translation-invariant states on an infinite chain, a realization of minimal +dimension can be exactly reconstructed via linear algebra operations from the +marginals of a size depending on the representation dimension. We establish a +bound on the trace norm error for an algorithm that estimates a candidate +realization from estimates of these marginals and outputs a matrix product +operator, estimating the state of a chain of arbitrary length $t$. This bound +allows us to establish an $O(t^2)$ upper bound on the sample complexity of the +learning task, with an explicit dependence on the site dimension, realization +dimension and spectral properties of a certain map constructed from the state. +A refined error bound can be proven for $C^*$-finitely correlated states, which +have an operational interpretation in terms of sequential quantum channels +applied to the memory system. We can also obtain an analogous error bound for a +class of matrix product density operators on a finite chain reconstructible by +local marginals. In this case, a linear number of marginals must be estimated, +obtaining a sample complexity of $\tilde{O}(t^3)$. The learning algorithm also +works for states that are sufficiently close to a finitely correlated state, +with the potential of providing competitive algorithms for other interesting +families of states. + +
+
+ comment: 42 pages, 7 figures. Manuscript restructured, with minor corrections + and clarifications +
+
+
+
+
+ + ♻ ☆ On the Challenges and Opportunities in Generative AI + + +
+ The field of deep generative modeling has grown rapidly in the last few +years. With the availability of massive amounts of training data coupled with +advances in scalable unsupervised learning paradigms, recent large-scale +generative models show tremendous promise in synthesizing high-resolution +images and text, as well as structured data such as videos and molecules. +However, we argue that current large-scale generative AI models exhibit several +fundamental shortcomings that hinder their widespread adoption across domains. +In this work, our objective is to identify these issues and highlight key +unresolved challenges in modern generative AI paradigms that should be +addressed to further enhance their capabilities, versatility, and reliability. +By identifying these challenges, we aim to provide researchers with insights +for exploring fruitful research directions, thus fostering the development of +more robust and accessible generative AI solutions. + +
+
+
+
+
+ + ♻ ☆ Gumbel Counterfactual Generation From Language Models ICLR 2025 + + +
+ Understanding and manipulating the causal generation mechanisms in language +models is essential for controlling their behavior. Previous work has primarily +relied on techniques such as representation surgery -- e.g., model ablations or +manipulation of linear subspaces tied to specific concepts -- to +\emph{intervene} on these models. To understand the impact of interventions +precisely, it is useful to examine \emph{counterfactuals} -- e.g., how a given +sentence would have appeared had it been generated by the model following a +specific intervention. We highlight that counterfactual reasoning is +conceptually distinct from interventions, as articulated in Pearl's causal +hierarchy. Based on this observation, we propose a framework for generating +true string counterfactuals by reformulating language models as a structural +equation model using the Gumbel-max trick, which we call Gumbel +counterfactual generation. This reformulation allows us to model the joint +distribution over original strings and their counterfactuals resulting from the +same instantiation of the sampling noise. We develop an algorithm based on +hindsight Gumbel sampling that allows us to infer the latent noise variables +and generate counterfactuals of observed strings. Our experiments demonstrate +that the approach produces meaningful counterfactuals while at the same time +showing that commonly used intervention techniques have considerable undesired +side effects. + +
+
+ comment: Accepted in ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Efficient Diversity-Preserving Diffusion Alignment via Gradient-Informed + GFlowNets ICLR 2025 + + +
+ While one commonly trains large diffusion models by collecting datasets on +target downstream tasks, it is often desired to align and finetune pretrained +diffusion models with some reward functions that are either designed by experts +or learned from small-scale datasets. Existing post-training methods for reward +finetuning of diffusion models typically suffer from lack of diversity in +generated samples, lack of prior preservation, and/or slow convergence in +finetuning. Inspired by recent successes in generative flow networks +(GFlowNets), a class of probabilistic models that sample with the unnormalized +density of a reward function, we propose a novel GFlowNet method dubbed +Nabla-GFlowNet (abbreviated as $\nabla$-GFlowNet), the first GFlowNet method that +leverages the rich signal in reward gradients, together with an objective +called $\nabla$-DB plus its variant residual $\nabla$-DB designed for prior-preserving +diffusion finetuning. We show that our proposed method achieves fast yet +diversity- and prior-preserving finetuning of Stable Diffusion, a large-scale +text-conditioned image diffusion model, on different realistic reward +functions. + +
+
+ comment: Technical Report (35 pages, 31 figures), Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Procedural Knowledge in Pretraining Drives Reasoning in Large Language + Models ICLR 2025 + + +
+ The capabilities and limitations of Large Language Models have been sketched +out in great detail in recent years, providing an intriguing yet conflicting +picture. On the one hand, LLMs demonstrate a general ability to solve problems. +On the other hand, they show surprising reasoning gaps when compared to humans, +casting doubt on the robustness of their generalisation strategies. The sheer +volume of data used in the design of LLMs has precluded us from applying the +method traditionally used to measure generalisation: train-test set separation. +To overcome this, we study what kind of generalisation strategies LLMs employ +when performing reasoning tasks by investigating the pretraining data they rely +on. For two models of different sizes (7B and 35B) and 2.5B of their +pretraining tokens, we identify what documents influence the model outputs for +three simple mathematical reasoning tasks and contrast this to the data that +are influential for answering factual questions. We find that, while the models +rely on mostly distinct sets of data for each factual question, a document +often has a similar influence across different reasoning questions within the +same task, indicating the presence of procedural knowledge. We further find +that the answers to factual questions often show up in the most influential +data. However, for reasoning questions the answers usually do not show up as +highly influential, nor do the answers to the intermediate reasoning steps. +When we characterise the top ranked documents for the reasoning questions +qualitatively, we confirm that the influential documents often contain +procedural knowledge, like demonstrating how to obtain a solution using +formulae or code. Our findings indicate that the approach to reasoning the +models use is unlike retrieval, and more like a generalisable strategy that +synthesises procedural knowledge from documents doing a similar form of +reasoning. + +
+
+ comment: Published at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Learning from negative feedback, or positive feedback or both + + +
+ Existing preference optimization methods often assume scenarios where paired +preference feedback (preferred/positive vs. dis-preferred/negative examples) is +available. This requirement limits their applicability in scenarios where only +unpaired feedback--for example, either positive or negative--is available. To +address this, we introduce a novel approach that decouples learning from +positive and negative feedback. This decoupling enables control over the +influence of each feedback type and, importantly, allows learning even when +only one feedback type is present. A key contribution is demonstrating stable +learning from negative feedback alone, a capability not well-addressed by +current methods. Our approach builds upon the probabilistic framework +introduced in (Dayan and Hinton, 1997), which uses expectation-maximization +(EM) to directly optimize the probability of positive outcomes (as opposed to +classic expected reward maximization). We address a key limitation in current +EM-based methods: they solely maximize the likelihood of positive examples, +while neglecting negative ones. We show how to extend EM algorithms to +explicitly incorporate negative examples, leading to a theoretically grounded +algorithm that offers an intuitive and versatile way to learn from both +positive and negative feedback. We evaluate our approach for training language +models based on human feedback as well as training policies for sequential +decision-making problems, where learned value functions are available. + +
+
+
+
+
+ + ♻ ☆ MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor + Scene Generation AAAI 2025 + + +
+ Controllable 3D scene generation has extensive applications in virtual +reality and interior design, where the generated scenes should exhibit high +levels of realism and controllability in terms of geometry. Scene graphs +provide a suitable data representation that facilitates these applications. +However, current graph-based methods for scene generation are constrained to +text-based inputs and exhibit insufficient adaptability to flexible user +inputs, hindering the ability to precisely control object geometry. To address +this issue, we propose MMGDreamer, a dual-branch diffusion model for scene +generation that incorporates a novel Mixed-Modality Graph, visual enhancement +module, and relation predictor. The mixed-modality graph allows object nodes to +integrate textual and visual modalities, with optional relationships between +nodes. It enhances adaptability to flexible user inputs and enables meticulous +control over the geometry of objects in the generated scenes. The visual +enhancement module enriches the visual fidelity of text-only nodes by +constructing visual representations using text embeddings. Furthermore, our +relation predictor leverages node representations to infer absent relationships +between nodes, resulting in more coherent scene layouts. Extensive experimental +results demonstrate that MMGDreamer exhibits superior control of object +geometry, achieving state-of-the-art scene generation performance. Project +page: https://yangzhifeio.github.io/project/MMGDreamer. + +
+
+ comment: Accepted by AAAI 2025 Main Track +
+
+
+
+
+ + ♻ ☆ Magnetic Field Data Calibration with Transformer Model Using Physical + Constraints: A Scalable Method for Satellite Missions, Illustrated by + Tianwen-1 + + +
+ This study introduces a novel approach that integrates the magnetic field +data correction from the Tianwen-1 Mars mission with a neural network +architecture constrained by physical principles derived from Maxwell's +equations. By employing a Transformer-based model capable of efficiently +handling sequential data, the method corrects measurement anomalies caused by +satellite dynamics, instrument interference, and environmental noise. As a +result, it significantly improves both the accuracy and the physical +consistency of the calibrated data. Compared to traditional methods that +require long data segments and manual intervention, often taking weeks or even +months to complete, this new approach can finish calibration in just minutes to +hours, and predictions are made within seconds. This innovation not only +accelerates the process of space weather modeling and planetary magnetospheric +studies but also provides a robust framework for future planetary exploration +and solar wind interaction research. + +
+
+
+
+
+ + ♻ ☆ MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D + Medical Image Analysis + + +
+ Efficient evaluation of three-dimensional (3D) medical images is crucial for +diagnostic and therapeutic practices in healthcare. Recent years have seen a +substantial uptake in applying deep learning and computer vision to analyse and +interpret medical images. Traditional approaches, such as convolutional neural +networks (CNNs) and vision transformers (ViTs), face significant computational +challenges, prompting the need for architectural advancements. Recent efforts +have led to the introduction of novel architectures like the ``Mamba'' model as +alternative solutions to traditional CNNs or ViTs. The Mamba model excels in +the linear processing of one-dimensional data with low computational demands. +However, Mamba's potential for 3D medical image analysis remains underexplored +and could face significant computational challenges as the dimension increases. +This manuscript presents MobileViM, a streamlined architecture for efficient +segmentation of 3D medical images. In the MobileViM network, we invent a new +dimension-independent mechanism and a dual-direction traversing approach to +incorporate with a vision-Mamba-based framework. MobileViM also features a +cross-scale bridging technique to improve efficiency and accuracy across +various medical imaging modalities. With these enhancements, MobileViM achieves +segmentation speeds exceeding 90 frames per second (FPS) on a single graphics +processing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster +than the state-of-the-art deep learning models for processing 3D images with +the same computational resources. In addition, experimental evaluations +demonstrate that MobileViM delivers superior performance, with Dice similarity +scores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024, +ATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses +existing models. + +
+
+ comment: The corresponding author disagrees with the manuscript submitted to + arXiv +
+
+
+
+
+ + ♻ ☆ Data Poisoning Attacks to Locally Differentially Private Range Query + Protocols + + +
+ Local Differential Privacy (LDP) has been widely adopted to protect user +privacy in decentralized data collection. However, recent studies have revealed +that LDP protocols are vulnerable to data poisoning attacks, where malicious +users manipulate their reported data to distort aggregated results. In this +work, we present the first study on data poisoning attacks targeting LDP range +query protocols, focusing on both tree-based and grid-based approaches. We +identify three key challenges in executing such attacks, including crafting +consistent and effective fake data, maintaining data consistency across levels +or grids, and preventing server detection. To address the first two challenges, +we propose novel attack methods that are provably optimal, including a +tree-based attack and a grid-based attack, designed to manipulate range query +results with high effectiveness. \textbf{Our key finding is that the common +post-processing procedure, Norm-Sub, in LDP range query protocols can help the +attacker massively amplify their attack effectiveness.} In addition, we study a +potential countermeasure, but also propose an adaptive attack capable of +evading this defense to address the third challenge. We evaluate our methods +through theoretical analysis and extensive experiments on synthetic and +real-world datasets. Our results show that the proposed attacks can +significantly amplify estimations for arbitrary range queries by manipulating a +small fraction of users, providing 5-10x more influence than a normal user to +the estimation. + +
+
+
+
+
+ + ♻ ☆ An Efficient Learning Method to Connect Observables + + +
+ Constructing fast and accurate surrogate models is a key ingredient for +making robust predictions in many topics. We introduce a new model, the +Multiparameter Eigenvalue Problem (MEP) emulator. The new method connects +emulators and can make predictions directly from observables to observables. We +present that the MEP emulator can be trained with data from Eigenvector +Continuation (EC) and Parametric Matrix Model (PMM) emulators. A simple +simulation on a one-dimensional lattice confirms the performance of the MEP +emulator. Using $^{28}$O as an example, we also demonstrate that the predictive +probability distribution of the target observables can be easily obtained +through the new emulator. + +
+
+ comment: 5+2 pages, 4 figures, updated acknowledgment +
+
+
+
+
+ + ♻ ☆ Which Frequencies do CNNs Need? Emergent Bottleneck Structure in Feature + Learning + + +
+ We describe the emergence of a Convolution Bottleneck (CBN) structure in +CNNs, where the network uses its first few layers to transform the input +representation into a representation that is supported only along a few +frequencies and channels, before using the last few layers to map back to the +outputs. We define the CBN rank, which describes the number and type of +frequencies that are kept inside the bottleneck, and partially prove that the +parameter norm required to represent a function $f$ scales as depth times the +CBN rank of $f$. We also show that the parameter norm depends at next order on the +regularity of $f$. We show that any network with almost optimal parameter norm +will exhibit a CBN structure in both the weights and - under the assumption +that the network is stable under large learning rate - the activations, which +motivates the common practice of down-sampling; and we verify that the CBN +results still hold with down-sampling. Finally we use the CBN structure to +interpret the functions learned by CNNs on a number of tasks. + +
+
+
+
+
+ + ♻ ☆ Hamiltonian Mechanics of Feature Learning: Bottleneck Structure in Leaky + ResNets + + +
+ We study Leaky ResNets, which interpolate between ResNets and Fully-Connected +nets depending on an 'effective depth' hyper-parameter $\tilde{L}$. In the +infinite depth limit, we study 'representation geodesics' $A_{p}$: continuous +paths in representation space (similar to NeuralODEs) from input $p=0$ to +output $p=1$ that minimize the parameter norm of the network. We give a +Lagrangian and Hamiltonian reformulation, which highlight the importance of two +terms: a kinetic energy which favors small layer derivatives +$\partial_{p}A_{p}$ and a potential energy that favors low-dimensional +representations, as measured by the 'Cost of Identity'. The balance between +these two forces offers an intuitive understanding of feature learning in +ResNets. We leverage this intuition to explain the emergence of a bottleneck +structure, as observed in previous work: for large $\tilde{L}$ the potential +energy dominates and leads to a separation of timescales, where the +representation jumps rapidly from the high dimensional inputs to a +low-dimensional representation, moves slowly inside the space of low-dimensional +representations, before jumping back to the potentially high-dimensional +outputs. Inspired by this phenomenon, we train with an adaptive layer step-size +to adapt to the separation of timescales. + +
+
+
+
+
+ + ♻ ☆ How DNNs break the Curse of Dimensionality: Compositionality and + Symmetry Learning + + +
+ We show that deep neural networks (DNNs) can efficiently learn any +composition of functions with bounded $F_{1}$-norm, which allows DNNs to break +the curse of dimensionality in ways that shallow networks cannot. More +specifically, we derive a generalization bound that combines a covering number +argument for compositionality, and the $F_{1}$-norm (or the related Barron +norm) for large width adaptivity. We show that the global minimizer of the +regularized loss of DNNs can fit for example the composition of two functions +$f^{*}=h\circ g$ from a small number of observations, assuming $g$ is +smooth/regular and reduces the dimensionality (e.g. $g$ could be the quotient +map of the symmetries of $f^{*}$), so that $h$ can be learned in spite of its +low regularity. The measure of regularity we consider is the Sobolev norm with +different levels of differentiability, which is well adapted to the $F_{1}$ +norm. We compute scaling laws empirically and observe phase transitions +depending on whether $g$ or $h$ is harder to learn, as predicted by our theory. + +
+
+
+
+
+ + ♻ ☆ CATCH: Channel-Aware multivariate Time Series Anomaly Detection via + Frequency Patching ICLR 2025 + + +
+ Anomaly detection in multivariate time series is challenging as heterogeneous +subsequence anomalies may occur. Reconstruction-based methods, which focus on +learning normal patterns in the frequency domain to detect diverse abnormal +subsequences, achieve promising results, while still falling short on capturing +fine-grained frequency characteristics and channel correlations. To contend +with the limitations, we introduce CATCH, a framework based on frequency +patching. We propose to patchify the frequency domain into frequency bands, +which enhances its ability to capture fine-grained frequency characteristics. +To perceive appropriate channel correlations, we propose a Channel Fusion +Module (CFM), which features a patch-wise mask generator and a masked-attention +mechanism. Driven by a bi-level multi-objective optimization algorithm, the CFM +is encouraged to iteratively discover appropriate patch-wise channel +correlations, and to cluster relevant channels while isolating adverse effects +from irrelevant channels. Extensive experiments on 10 real-world datasets and +12 synthetic datasets demonstrate that CATCH achieves state-of-the-art +performance. We make our code and datasets available at +https://github.com/decisionintelligence/CATCH. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ AfroBench: How Good are Large Language Models on African Languages? + + +
+ Large-scale multilingual evaluations, such as MEGA, often include only a +handful of African languages due to the scarcity of high-quality evaluation +data and the limited discoverability of existing African datasets. This lack of +representation hinders comprehensive LLM evaluation across a diverse range of +languages and tasks. To address these challenges, we introduce AfroBench -- a +multi-task benchmark for evaluating the performance of LLMs across 64 African +languages, 15 tasks and 22 datasets. AfroBench consists of nine natural +language understanding datasets, six text generation datasets, six knowledge +and question answering tasks, and one mathematical reasoning task. We present +results comparing the performance of prompting LLMs to fine-tuned baselines +based on BERT and T5-style models. Our results suggest large gaps in +performance between high-resource languages, such as English, and African +languages across most tasks; but performance also varies based on the +availability of monolingual data resources. Our findings confirm that +performance on African languages continues to remain a hurdle for current LLMs, +underscoring the need for additional efforts to close this gap. + https://mcgill-nlp.github.io/AfroBench/ + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Robust Amortized Bayesian Inference with Self-Consistency Losses on + Unlabeled Data + + +
+ Neural amortized Bayesian inference (ABI) can solve probabilistic inverse +problems orders of magnitude faster than classical methods. However, neural ABI +is not yet sufficiently robust for widespread and safe applicability. In +particular, when performing inference on observations outside of the scope of +the simulated data seen during training, for example, because of model +misspecification, the posterior approximations are likely to become highly +biased. Due to the bad pre-asymptotic behavior of current neural posterior +estimators in the out-of-simulation regime, the resulting estimation biases +cannot be fixed in acceptable time by just simulating more training data. In +this proof-of-concept paper, we propose a semi-supervised approach that enables +training not only on (labeled) simulated data generated from the model, but +also on unlabeled data originating from any source, including real-world data. +To achieve the latter, we exploit Bayesian self-consistency properties that can +be transformed into strictly proper losses without requiring knowledge of true +parameter values, that is, without requiring data labels. The results of our +initial experiments show remarkable improvements in the robustness of ABI on +out-of-simulation data. Even if the observed data is far away from both labeled +and unlabeled training data, inference remains highly accurate. If our findings +also generalize to other scenarios and model classes, we believe that our new +method represents a major breakthrough in neural ABI. + +
+
+ comment: added acknowledgements +
+
+
+
+
+ + ♻ ☆ Enhancing Vietnamese VQA through Curriculum Learning on Raw and + Augmented Text Representations AAAI-25 + + +
+ Visual Question Answering (VQA) is a multimodal task requiring reasoning +across textual and visual inputs, which becomes particularly challenging in +low-resource languages like Vietnamese due to linguistic variability and the +lack of high-quality datasets. Traditional methods often rely heavily on +extensive annotated datasets, computationally expensive pipelines, and large +pre-trained models, specifically in the domain of Vietnamese VQA, limiting +their applicability in such scenarios. To address these limitations, we propose +a training framework that combines a paraphrase-based feature augmentation +module with a dynamic curriculum learning strategy. Explicitly, augmented +samples are considered "easy" while raw samples are regarded as "hard". The +framework then utilizes a mechanism that dynamically adjusts the ratio of easy +to hard samples during training, progressively modifying the same dataset to +increase its difficulty level. By enabling gradual adaptation to task +complexity, this approach helps the Vietnamese VQA model generalize well, thus +improving overall performance. Experimental results show consistent +improvements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset, +highlighting both the potential and challenges of our approach in advancing VQA +for Vietnamese language. + +
+
+ comment: 10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and + Intelligence +
+
+
+
+
+ + ♻ ☆ Detecting new obfuscated malware variants: A lightweight and + interpretable machine learning approach + + +
+ Machine learning has been successfully applied in developing malware +detection systems, with a primary focus on accuracy, and increasing attention +to reducing computational overhead and improving model interpretability. +However, an important question remains underexplored: How well can machine +learning-based models detect entirely new forms of malware not present in the +training data? In this study, we present a machine learning-based system for +detecting obfuscated malware that is not only highly accurate, lightweight and +interpretable, but also capable of successfully adapting to new types of +malware attacks. Our system is capable of detecting 15 malware subtypes despite +being exclusively trained on one malware subtype, namely the Transponder from +the Spyware family. This system was built after training 15 distinct random +forest-based models, each on a different malware subtype from the +CIC-MalMem-2022 dataset. These models were evaluated against the entire range +of malware subtypes, including all unseen malware subtypes. To maintain the +system's streamlined nature, training was confined to the top five most +important features, which also enhanced interpretability. The +Transponder-focused model exhibited high accuracy, exceeding 99.8%, with an +average processing speed of 5.7 microseconds per file. We also illustrate how +the Shapley additive explanations technique can facilitate the interpretation +of the model predictions. Our research contributes to advancing malware +detection methodologies, pioneering the feasibility of detecting obfuscated +malware by exclusively training a model on a single or a few carefully selected +malware subtypes and applying it to detect unseen subtypes. + +
+
+ comment: 30 pages (excluding Appendix), 5 figures and 5 tables. Now published + in Intelligent Systems with Applications + (https://doi.org/10.1016/j.iswa.2024.200472) +
+
+
+
+
+ + ♻ ☆ Nature Language Model: Deciphering the Language of Nature for Scientific + Discovery + + +
+ Foundation models have revolutionized natural language processing and +artificial intelligence, significantly enhancing how machines comprehend and +generate human languages. Inspired by the success of these foundation models, +researchers have developed foundation models for individual scientific domains, +including small molecules, materials, proteins, DNA, RNA and even cells. +However, these models are typically trained in isolation, lacking the ability +to integrate across different scientific domains. Recognizing that entities +within these domains can all be represented as sequences, which together form +the "language of nature", we introduce Nature Language Model (NatureLM), a +sequence-based science foundation model designed for scientific discovery. +Pre-trained with data from multiple scientific domains, NatureLM offers a +unified, versatile model that enables various applications including: (i) +generating and optimizing small molecules, proteins, RNA, and materials using +text instructions; (ii) cross-domain generation/design, such as +protein-to-molecule and protein-to-RNA generation; and (iii) top performance +across different domains, matching or surpassing state-of-the-art specialist +models. NatureLM offers a promising generalist approach for various scientific +tasks, including drug discovery (hit generation/optimization, ADMET +optimization, synthesis), novel material design, and the development of +therapeutic proteins or nucleotides. We have developed NatureLM models in +different sizes (1 billion, 8 billion, and 46.7 billion parameters) and +observed a clear improvement in performance as the model size increases. + +
+
+ comment: 93 pages +
+
+
+
+
+ + ♻ ☆ Deep unrolling for learning optimal spatially varying regularisation + parameters for Total Generalised Variation + + +
+ We extend a recently introduced deep unrolling framework for learning +spatially varying regularisation parameters in inverse imaging problems to the +case of Total Generalised Variation (TGV). The framework combines a deep +convolutional neural network (CNN) inferring the two spatially varying TGV +parameters with an unrolled algorithmic scheme that solves the corresponding +variational problem. The two subnetworks are jointly trained end-to-end in a +supervised fashion and as such the CNN learns to compute those parameters that +drive the reconstructed images as close to the ground truth as possible. +Numerical results in image denoising and MRI reconstruction show a significant +qualitative and quantitative improvement compared to the best TGV scalar +parameter case as well as to other approaches employing spatially varying +parameters computed by unsupervised methods. We also observe that the inferred +spatially varying parameter maps have a consistent structure near the image +edges, asking for further theoretical investigations. In particular, the +parameter that weighs the first-order TGV term has a triple-edge structure with +alternating high-low-high values whereas the one that weighs the second-order +term attains small values in a large neighbourhood around the edges. + +
+
+
+
+
+ + ♻ ☆ HelpSteer2-Preference: Complementing Ratings with Preferences ICLR 2025 + + +
+ Reward models are critical for aligning models to follow instructions, and +are typically trained following one of two popular paradigms: Bradley-Terry +style or Regression style. However, there is a lack of evidence that either +approach is better than the other, when adequately matched for data. This is +primarily because these approaches require data collected in different (but +incompatible) formats, meaning that adequately matched data is not available in +existing public datasets. To tackle this problem, we release preference +annotations (designed for Bradley-Terry training) to complement existing +ratings (designed for Regression style training) in the HelpSteer2 dataset. To +improve data interpretability, preference annotations are accompanied with +human-written justifications. Using this data, we conduct the first +head-to-head comparison of Bradley-Terry and Regression models when adequately +matched for data. Based on insights derived from such a comparison, we propose +a novel approach to combine Bradley-Terry and Regression reward modeling. A +Llama-3.1-70B-Instruct model tuned with this approach scores 94.1 on +RewardBench, emerging top of more than 140 reward models as of 1 Oct 2024. This +reward model can then be used with REINFORCE algorithm (RLHF) to align an +Instruct model to reach 85.0 on Arena Hard, which is No. 1 as of 1 Oct 2024. We +open-source this dataset (CC-BY-4.0 license) at +https://huggingface.co/datasets/nvidia/HelpSteer2#preferences-new -- 1-oct-2024 +and openly release the trained Reward and Instruct models at +https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Reward and +https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct + +
+
+ comment: Accepted to ICLR 2025; 28 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Cross-Modal Prototype based Multimodal Federated Learning under Severely + Missing Modality + + +
+ Multimodal federated learning (MFL) has emerged as a decentralized machine +learning paradigm, allowing multiple clients with different modalities to +collaborate on training a global model across diverse data sources without +sharing their private data. However, challenges, such as data heterogeneity and +severely missing modalities, pose crucial hindrances to the robustness of MFL, +significantly impacting the performance of the global model. The occurrence of +missing modalities in real-world applications, such as autonomous driving, +often arises from factors like sensor failures, leading to knowledge gaps during +the training process. Specifically, the absence of a modality introduces +misalignment during the local training phase, stemming from zero-filling in the +case of clients with missing modalities. Consequently, achieving robust +generalization in the global model becomes imperative, especially when dealing with +clients that have incomplete data. In this paper, we propose +$\textbf{Multimodal Federated Cross Prototype Learning (MFCPL)}$, a novel +approach for MFL under severely missing modalities. Our MFCPL leverages the +complete prototypes to provide diverse modality knowledge in modality-shared +level with the cross-modal regularization and modality-specific level with +cross-modal contrastive mechanism. Additionally, our approach introduces the +cross-modal alignment to provide regularization for modality-specific features, +thereby enhancing the overall performance, particularly in scenarios involving +severely missing modalities. Through extensive experiments on three multimodal +datasets, we demonstrate the effectiveness of MFCPL in mitigating the +challenges of data heterogeneity and severely missing modalities while +improving the overall performance and robustness of MFL. + +
+
+ comment: 14 pages, 8 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Extracting Formulae in Many-Valued Logic from Deep Neural Networks + + +
+ We propose a new perspective on deep ReLU networks, namely as circuit +counterparts of Lukasiewicz infinite-valued logic -- a many-valued (MV) +generalization of Boolean logic. An algorithm for extracting formulae in MV +logic from deep ReLU networks is presented. As the algorithm applies to +networks with general, in particular also real-valued, weights, it can be used +to extract logical formulae from deep ReLU networks trained on data. + +
+
+ comment: Significant extension of the previous version +
+
+
+
+
+ + ♻ ☆ Learning truly monotone operators with applications to nonlinear inverse + problems + + +
+ This article introduces a novel approach to learning monotone neural networks +through a newly defined penalization loss. The proposed method is particularly +effective in solving classes of variational problems, specifically monotone +inclusion problems, commonly encountered in image processing tasks. The +Forward-Backward-Forward (FBF) algorithm is employed to address these problems, +offering a solution even when the Lipschitz constant of the neural network is +unknown. Notably, the FBF algorithm provides convergence guarantees under the +condition that the learned operator is monotone. Building on plug-and-play +methodologies, our objective is to apply these newly learned operators to +solving non-linear inverse problems. To achieve this, we initially formulate +the problem as a variational inclusion problem. Subsequently, we train a +monotone neural network to approximate an operator that may not inherently be +monotone. Leveraging the FBF algorithm, we then show simulation examples where +the non-linear inverse problem is successfully solved. + +
+
+
+
+
+ + ♻ ☆ A conversion theorem and minimax optimality for continuum contextual + bandits + + +
+ We study the contextual continuum bandits problem, where the learner +sequentially receives a side information vector and has to choose an action in +a convex set, minimizing a function associated with the context. The goal is to +minimize all the underlying functions for the received contexts, leading to the +contextual notion of regret, which is stronger than the standard static regret. +Assuming that the objective functions are $\gamma$-H\"older with respect to the +contexts, $0<\gamma\le 1,$ we demonstrate that any algorithm achieving a +sub-linear static regret can be extended to achieve a sub-linear contextual +regret. We prove a static-to-contextual regret conversion theorem that provides +an upper bound for the contextual regret of the output algorithm as a function +of the static regret of the input algorithm. We further study the implications +of this general result for three fundamental cases of dependency of the +objective function on the action variable: (a) Lipschitz bandits, (b) convex +bandits, (c) strongly convex and smooth bandits. For Lipschitz bandits and +$\gamma=1,$ combining our results with the lower bound of Slivkins (2014), we +prove that the minimax optimal contextual regret for the noise-free adversarial +setting is achieved. Then, we prove that in the presence of noise, the +contextual regret rate as a function of the number of queries is the same for +convex bandits as it is for strongly convex and smooth bandits. Lastly, we +present a minimax lower bound, implying two key facts. First, obtaining a +sub-linear contextual regret may be impossible over functions that are not +continuous with respect to the context. Second, for convex bandits and strongly +convex and smooth bandits, the algorithms that we propose achieve, up to a +logarithmic factor, the minimax optimal rate of contextual regret as a function +of the number of queries. + +
+
+
+
+
+ + ♻ ☆ Federated Learning With Individualized Privacy Through Client Sampling ICML + + +
+ With growing concerns about user data collection, individualized privacy has +emerged as a promising solution to balance protection and utility by accounting +for diverse user privacy preferences. Instead of enforcing a uniform level of +anonymization for all users, this approach allows individuals to choose privacy +settings that align with their comfort levels. Building on this idea, we +propose an adapted method for enabling Individualized Differential Privacy +(IDP) in Federated Learning (FL) by handling clients according to their +personal privacy preferences. By extending the SAMPLE algorithm from +centralized settings to FL, we calculate client-specific sampling rates based +on their heterogeneous privacy budgets and integrate them into a modified +IDP-FedAvg algorithm. We test this method under realistic privacy distributions +and multiple datasets. The experimental results demonstrate that our approach +achieves clear improvements over uniform DP baselines, reducing the trade-off +between privacy and utility. Compared to the alternative SCALE method in +related work, which assigns differing noise scales to clients, our method +performs notably better. However, challenges remain for complex tasks with +non-i.i.d. data, primarily stemming from the constraints of the decentralized +setting. + +
+
+ comment: Accepted at 10th International Conference on Machine Learning + Technologies (ICMLT 2025) +
+
+
+
+
+ + ♻ ☆ Decentralized Sporadic Federated Learning: A Unified Algorithmic + Framework with Convergence Guarantees + + +
+ Decentralized federated learning (DFL) captures FL settings where both (i) +model updates and (ii) model aggregations are exclusively carried out by the +clients without a central server. Existing DFL works have mostly focused on +settings where clients conduct a fixed number of local updates between local +model exchanges, overlooking heterogeneity and dynamics in communication and +computation capabilities. In this work, we propose Decentralized Sporadic +Federated Learning ($\texttt{DSpodFL}$), a DFL methodology built on a +generalized notion of $\textit{sporadicity}$ in both local gradient and +aggregation processes. $\texttt{DSpodFL}$ subsumes many existing decentralized +optimization methods under a unified algorithmic framework by modeling the +per-iteration (i) occurrence of gradient descent at each client and (ii) +exchange of models between client pairs as arbitrary indicator random +variables, thus capturing $\textit{heterogeneous and time-varying}$ +computation/communication scenarios. We analytically characterize the +convergence behavior of $\texttt{DSpodFL}$ for both convex and non-convex +models and for both constant and diminishing learning rates, under mild +assumptions on the communication graph connectivity, data heterogeneity across +clients, and gradient noises. We show how our bounds recover existing results +from decentralized gradient descent as special cases. Experiments demonstrate +that $\texttt{DSpodFL}$ consistently achieves improved training speeds compared +with baselines under various system settings. + +
+
+
+
+
+ + ♻ ☆ Social Genome: Grounded Social Reasoning Abilities of Multimodal Models + + +
+ Social reasoning abilities are crucial for AI systems to effectively +interpret and respond to multimodal human communication and interaction within +social contexts. We introduce Social Genome, the first benchmark for +fine-grained, grounded social reasoning abilities of multimodal models. Social +Genome contains 272 videos of interactions and 1,486 human-annotated reasoning +traces related to inferences about these interactions. These traces contain +5,777 reasoning steps that reference evidence from visual cues, verbal cues, +vocal cues, and external knowledge (contextual knowledge external to videos). +Social Genome is also the first modeling challenge to study external knowledge +in social reasoning. Social Genome computes metrics to holistically evaluate +semantic and structural qualities of model-generated social reasoning traces. +We demonstrate the utility of Social Genome through experiments with +state-of-the-art models, identifying performance gaps and opportunities for +future research to improve the grounded social reasoning abilities of +multimodal models. + +
+
+ comment: Under Review, 22 pages +
+
+
+
+
+ + ♻ ☆ $σ$-zero: Gradient-based Optimization of $\ell_0$-norm Adversarial + Examples ICLR 2025 + + +
+ Evaluating the adversarial robustness of deep networks to gradient-based +attacks is challenging. While most attacks consider $\ell_2$- and +$\ell_\infty$-norm constraints to craft input perturbations, only a few +investigate sparse $\ell_1$- and $\ell_0$-norm attacks. In particular, +$\ell_0$-norm attacks remain the least studied due to the inherent complexity +of optimizing over a non-convex and non-differentiable constraint. However, +evaluating adversarial robustness under these attacks could reveal weaknesses +otherwise left untested with more conventional $\ell_2$- and $\ell_\infty$-norm +attacks. In this work, we propose a novel $\ell_0$-norm attack, called +$\sigma$-zero, which leverages a differentiable approximation of the $\ell_0$ +norm to facilitate gradient-based optimization, and an adaptive projection +operator to dynamically adjust the trade-off between loss minimization and +perturbation sparsity. Extensive evaluations using MNIST, CIFAR10, and ImageNet +datasets, involving robust and non-robust models, show that +$\sigma$\texttt{-zero} finds minimum $\ell_0$-norm adversarial examples without +requiring any time-consuming hyperparameter tuning, and that it outperforms all +competing sparse attacks in terms of success rate, perturbation size, and +efficiency. + +
+
+ comment: Paper accepted at International Conference on Learning + Representations (ICLR 2025). Code available at + https://github.com/sigma0-advx/sigma-zero +
+
+
+
+
+ + ♻ ☆ VISION-XL: High Definition Video Inverse Problem Solver using Latent + Image Diffusion Models + + +
+ In this paper, we propose a novel framework for solving high-definition video +inverse problems using latent image diffusion models. Building on recent +advancements in spatio-temporal optimization for video inverse problems using +image diffusion models, our approach leverages latent-space diffusion models to +achieve enhanced video quality and resolution. To address the high +computational demands of processing high-resolution frames, we introduce a +pseudo-batch consistent sampling strategy, allowing efficient operation on a +single GPU. Additionally, to improve temporal consistency, we present +pseudo-batch inversion, an initialization technique that incorporates +informative latents from the measurement. By integrating with SDXL, our +framework achieves state-of-the-art video reconstruction across a wide range of +spatio-temporal inverse problems, including complex combinations of frame +averaging and various spatial degradations, such as deblurring, +super-resolution, and inpainting. Unlike previous methods, our approach +supports multiple aspect ratios (landscape, vertical, and square) and delivers +HD-resolution reconstructions (exceeding 1280x720) in under 6 seconds per frame +on a single NVIDIA 4090 GPU. + +
+
+ comment: Project page: https://vision-xl.github.io/ +
+
+
+
+
+ + ♻ ☆ No More Sliding Window: Efficient 3D Medical Image Segmentation with + Differentiable Top-k Patch Sampling + + +
+ 3D models surpass 2D models in CT/MRI segmentation by effectively capturing +inter-slice relationships. However, the added depth dimension substantially +increases memory consumption. While patch-based training alleviates memory +constraints, it significantly slows down the inference speed due to the sliding +window (SW) approach. We propose No-More-Sliding-Window (NMSW), a novel +end-to-end trainable framework that enhances the efficiency of generic 3D +segmentation backbone during an inference step by eliminating the need for SW. +NMSW employs a differentiable Top-k module to selectively sample only the most +relevant patches, thereby minimizing redundant computations. When patch-level +predictions are insufficient, the framework intelligently leverages coarse +global predictions to refine results. Evaluated across 3 tasks using 3 +segmentation backbones, NMSW achieves competitive accuracy compared to SW +inference while significantly reducing computational complexity by 91% (88.0 to +8.00 TMACs). Moreover, it delivers a 9.1x faster inference on the H100 GPU +(99.0 to 8.3 sec) and a 11.1x faster inference on the Xeon Gold CPU (2110 to +189 sec). NMSW is model-agnostic, further boosting efficiency when integrated +with any existing efficient segmentation backbones. + +
+
+
+
+
+ + ♻ ☆ Unveiling the Power of Noise Priors: Enhancing Diffusion Models for + Mobile Traffic Prediction + + +
+ Accurate prediction of mobile traffic, \textit{i.e.,} network traffic from +cellular base stations, is crucial for optimizing network performance and +supporting urban development. However, the non-stationary nature of mobile +traffic, driven by human activity and environmental changes, leads to both +regular patterns and abrupt variations. Diffusion models excel in capturing +such complex temporal dynamics due to their ability to capture the inherent +uncertainties. Most existing approaches prioritize designing novel denoising +networks but often neglect the critical role of noise itself, potentially +leading to sub-optimal performance. In this paper, we introduce a novel +perspective by emphasizing the role of noise in the denoising process. Our +analysis reveals that noise fundamentally shapes mobile traffic predictions, +exhibiting distinct and consistent patterns. We propose NPDiff, a framework +that decomposes noise into \textit{prior} and \textit{residual} components, +with the \textit{prior} derived from data dynamics, enhancing the model's +ability to capture both regular and abrupt variations. NPDiff can seamlessly +integrate with various diffusion-based prediction models, delivering +predictions that are effective, efficient, and robust. Extensive experiments +demonstrate that it achieves superior performance with an improvement over +30\%, offering a new perspective on leveraging diffusion models in this domain. + +
+
+
+
+
+ + ♻ ☆ KAGNNs: Kolmogorov-Arnold Networks meet Graph Learning + + +
+ In recent years, Graph Neural Networks (GNNs) have become the de facto tool +for learning node and graph representations. Most GNNs typically consist of a +sequence of neighborhood aggregation (a.k.a., message-passing) layers, within +which the representation of each node is updated based on those of its +neighbors. The most expressive message-passing GNNs can be obtained through the +use of the sum aggregator and of MLPs for feature transformation, thanks to +their universal approximation capabilities. However, the limitations of MLPs +recently motivated the introduction of another family of universal +approximators, called Kolmogorov-Arnold Networks (KANs) which rely on a +different representation theorem. In this work, we compare the performance of +KANs against that of MLPs on graph learning tasks. We implement three new +KAN-based GNN layers, inspired respectively by the GCN, GAT and GIN layers. We +evaluate two different implementations of KANs using two distinct base families +of functions, namely B-splines and radial basis functions. We perform extensive +experiments on node classification, link prediction, graph classification and +graph regression datasets. Our results indicate that KANs are on-par with or +better than MLPs on all tasks studied in this paper. We also show that the size +and training speed of RBF-based KANs is only marginally higher than for MLPs, +making them viable alternatives. Code available at +https://github.com/RomanBresson/KAGNN. + +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks for Edge Signals: Orientation Equivariance and + Invariance + + +
+ Many applications in traffic, civil engineering, or electrical engineering +revolve around edge-level signals. Such signals can be categorized as +inherently directed, for example, the water flow in a pipe network, and +undirected, like the diameter of a pipe. Topological methods model edge signals +with inherent direction by representing them relative to a so-called +orientation assigned to each edge. These approaches can neither model +undirected edge signals nor distinguish if an edge itself is directed or +undirected. We address these shortcomings by (i) revising the notion of +orientation equivariance to enable edge direction-aware topological models, +(ii) proposing orientation invariance as an additional requirement to describe +signals without inherent direction, and (iii) developing EIGN, an architecture +composed of novel direction-aware edge-level graph shift operators, that +provably fulfills the aforementioned desiderata. It is the first +general-purpose topological GNN for edge-level signals that can model directed +and undirected signals while distinguishing between directed and undirected +edges. A comprehensive evaluation shows that EIGN outperforms prior work in +edge-level tasks, for example, improving in RMSE on flow simulation tasks by up +to 23.5%. + +
+
+
+
+
+ + ♻ ☆ GIFT: Unlocking Full Potential of Labels in Distilled Dataset at + Near-zero Cost + + +
+ Recent advancements in dataset distillation have demonstrated the significant +benefits of employing soft labels generated by pre-trained teacher models. In +this paper, we introduce a novel perspective by emphasizing the full +utilization of labels. We first conduct a comprehensive comparison of various +loss functions for soft label utilization in dataset distillation, revealing +that the model trained on the synthetic dataset exhibits high sensitivity to +the choice of loss function for soft label utilization. This finding highlights +the necessity of a universal loss function for training models on synthetic +datasets. Building on these insights, we introduce an extremely simple yet +surprisingly effective plug-and-play approach, GIFT, which encompasses soft +label refinement and a cosine similarity-based loss function to efficiently +leverage full label information. Extensive experiments indicate that GIFT +consistently enhances state-of-the-art dataset distillation methods across +various dataset scales, without incurring additional computational costs. +Importantly, GIFT significantly enhances cross-optimizer generalization, an +area previously overlooked. For instance, on ImageNet-1K with IPC = 10, GIFT +enhances the state-of-the-art method RDED by 30.8% in cross-optimizer +generalization. Our code is available at https://github.com/LINs-lab/GIFT. + +
+
+ comment: https://github.com/LINs-lab/GIFT +
+
+
+
+
+ + ♻ ☆ Estimation of multiple mean vectors in high dimension + + +
+ We endeavour to estimate numerous multi-dimensional means of various +probability distributions on a common space based on independent samples. Our +approach involves forming estimators through convex combinations of empirical +means derived from these samples. We introduce two strategies to find +appropriate data-dependent convex combination weights: a first one employing a +testing procedure to identify neighbouring means with low variance, which +results in a closed-form plug-in formula for the weights, and a second one +determining weights via minimization of an upper confidence bound on the +quadratic risk. Through theoretical analysis, we evaluate the improvement in +quadratic risk offered by our methods compared to the empirical means. Our +analysis focuses on a dimensional asymptotics perspective, showing that our +methods asymptotically approach an oracle (minimax) improvement as the +effective dimension of the data increases. We demonstrate the efficacy of our +methods in estimating multiple kernel mean embeddings through experiments on +both simulated and real-world datasets. + +
+
+
+
+
+ + ♻ ☆ Assessing Pre-Trained Models for Transfer Learning Through Distribution + of Spectral Components AAAI 2025 + + +
+ Pre-trained model assessment for transfer learning aims to identify the +optimal candidate for the downstream tasks from a model hub, without the need +of time-consuming fine-tuning. Existing advanced works mainly focus on +analyzing the intrinsic characteristics of the entire features extracted by +each pre-trained model or how well such features fit the target labels. This +paper proposes a novel perspective for pre-trained model assessment through the +Distribution of Spectral Components (DISCO). Through singular value +decomposition of features extracted from pre-trained models, we investigate +different spectral components and observe that they possess distinct +transferability, contributing diversely to the fine-tuning performance. +Inspired by this, we propose an assessment method based on the distribution of +spectral components which measures the proportions of their corresponding +singular values. Pre-trained models with features concentrating on more +transferable components are regarded as better choices for transfer learning. +We further leverage the labels of downstream data to better estimate the +transferability of each spectral component and derive the final assessment +criterion. Our proposed method is flexible and can be applied to both +classification and regression tasks. We conducted comprehensive experiments +across three benchmarks and two tasks including image classification and object +detection, demonstrating that our method achieves state-of-the-art performance +in choosing proper pre-trained models from the model hub for transfer learning. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning on Reconfigurable Hardware: Overcoming Material + Variability in Laser Material Processing ICRA + + +
+ Ensuring consistent processing quality is challenging in laser processes due +to varying material properties and surface conditions. Although some approaches +have shown promise in solving this problem via automation, they often rely on +predetermined targets or are limited to simulated environments. To address +these shortcomings, we propose a novel real-time reinforcement learning +approach for laser process control, implemented on a Field Programmable Gate +Array to achieve real-time execution. Our experimental results from laser +welding tests on stainless steel samples with a range of surface roughnesses +validated the method's ability to adapt autonomously, without relying on reward +engineering or prior setup information. Specifically, the algorithm learned the +correct power profile for each unique surface characteristic, demonstrating +significant improvements over hand-engineered optimal constant power strategies +-- up to 23% better performance on rougher surfaces and 7% on mixed surfaces. +This approach represents a significant advancement in automating and optimizing +laser processes, with potential applications across multiple industries. + +
+
+ comment: Accepted for the 2025 IEEE International Conference on Robotics and + Automation (ICRA), May 19-23, 2025, Atlanta, USA; Camera ready version -- + addressed reviewer comments in text, improved plot clarity +
+
+
+
+
+ + ♻ Wasserstein-regularized Conformal Prediction under General Distribution + Shift + + +
+ Conformal prediction yields a prediction set with guaranteed $1-\alpha$ +coverage of the true target under the i.i.d. assumption, which may not hold and +lead to a gap between $1-\alpha$ and the actual coverage. Prior studies bound +the gap using total variation distance, which cannot identify the gap changes +under distribution shift at a given $\alpha$. Besides, existing methods are +mostly limited to covariate shift, while general joint distribution shifts are +more common in practice but less researched. In response, we first propose a +Wasserstein distance-based upper bound of the coverage gap and analyze the +bound using probability measure pushforwards between the shifted joint data and +conformal score distributions, enabling a separation of the effect of covariate +and concept shifts over the coverage gap. We exploit the separation to design +an algorithm based on importance weighting and regularized representation +learning (WR-CP) to reduce the Wasserstein bound with a finite-sample error +bound. WR-CP achieves a controllable balance between conformal prediction +accuracy and efficiency. Experiments on six datasets prove that WR-CP can +reduce coverage gaps to $3.2\%$ across different confidence levels and outputs +prediction sets 37$\%$ smaller than the worst-case approach on average. + +
+
+
+
+
+ + ♻ ☆ Mixed Graph Contrastive Network for Semi-Supervised Node Classification + + +
+ Graph Neural Networks (GNNs) have achieved promising performance in +semi-supervised node classification in recent years. However, the problem of +insufficient supervision, together with representation collapse, largely limits +the performance of the GNNs in this field. To alleviate the collapse of node +representations in semi-supervised scenario, we propose a novel graph +contrastive learning method, termed Mixed Graph Contrastive Network (MGCN). In +our method, we improve the discriminative capability of the latent embeddings +by an interpolation-based augmentation strategy and a correlation reduction +mechanism. Specifically, we first conduct the interpolation-based augmentation +in the latent space and then force the prediction model to change linearly +between samples. Second, we enable the learned network to tell apart samples +across two interpolation-perturbed views through forcing the correlation matrix +across views to approximate an identity matrix. By combining the two settings, +we extract rich supervision information from both the abundant unlabeled nodes +and the rare yet valuable labeled nodes for discriminative representation +learning. Extensive experimental results on six datasets demonstrate the +effectiveness and the generality of MGCN compared to the existing +state-of-the-art methods. The code of MGCN is available at +https://github.com/xihongyang1999/MGCN on Github. + +
+
+
+
+
+ + ♻ ☆ Rethinking Weight-Averaged Model-merging + + +
+ Model-merging has emerged as a powerful approach in deep learning, capable of +enhancing model performance without any training. However, the underlying +mechanisms that explain its effectiveness remain largely unexplored. In this +paper, we investigate this technique from three novel perspectives to +empirically provide deeper insights into why and how weight-averaged +model-merging works: (1) we examine the intrinsic patterns captured by the +learning of the model weights, through the visualizations of their patterns on +several datasets, showing that these weights often encode structured and +interpretable patterns and that this is the essential reason why model-merging can work; +(2) we mathematically and empirically investigate model ensemble merging +strategies based on averaging on weights versus averaging on features, +providing detailed analyses across diverse architectures and datasets; and (3) +we explore the impact on model-merging prediction stability in terms of +changing the parameter magnitude, revealing insights into the way weight +averaging works as regularization by showing the robustness across different +parameter scales. Our findings shed light on the "black box" of weight-averaged +model-merging, offering valuable insights and practical recommendations that +advance the model-merging process. The code is available at +https://github.com/billhhh/Rethink-Merge. + +
+
+
+
+
+ + ♻ ☆ Careful with that Scalpel: Improving Gradient Surgery with an EMA + + +
+ Beyond minimizing a single training loss, many deep learning estimation +pipelines rely on an auxiliary objective to quantify and encourage desirable +properties of the model (e.g. performance on another dataset, robustness, +agreement with a prior). Although the simplest approach to incorporating an +auxiliary loss is to sum it with the training loss as a regularizer, recent +works have shown that one can improve performance by blending the gradients +beyond a simple sum; this is known as gradient surgery. We cast the problem as +a constrained minimization problem where the auxiliary objective is minimized +among the set of minimizers of the training loss. To solve this bilevel +problem, we follow a parameter update direction that combines the training loss +gradient and the orthogonal projection of the auxiliary gradient to the +training gradient. In a setting where gradients come from mini-batches, we +explain how, using a moving average of the training loss gradients, we can +carefully maintain this critical orthogonality property. We demonstrate that +our method, Bloop, can lead to much better performances on NLP and vision +experiments than other gradient surgery methods without EMA. + +
+
+
+
+
+ + ♻ ☆ Union of Experts: Adapting Hierarchical Routing to Equivalently + Decomposed Transformer + + +
+ We propose Union-of-Experts (UoE), which decomposes transformer into an +equivalent group of experts, and then implement selective routing on input data +and experts. Our approach advances MoE design with four key innovations: (1) We +conducted equivalent expert decomposition on both MLP blocks and attention blocks +based on matrix partition in tensor parallelism. (2) We developed two routing +paradigms: patch-wise data selection and expert selection, to apply routing +across different levels. (3) We design the architecture of UoE model, including +Selective Multi-Head Attention (SMHA) and Union-of-MLP-Experts (UoME). (4) We +develop parallel implementation of UoE's routing and computation operation, and +optimize efficiency based on the hardware processing analysis. The experiments +demonstrate that the UoE model surpasses Full Attention, state-of-the-art MoEs and +efficient transformers (including the model architecture of recently proposed +DeepSeek-V3) in several tasks across image and natural language domains. In +language modeling tasks, we achieve an average reduction of 2.38 in perplexity +compared to the best-performing MoE method with an average of 76% FLOPs. In Long +Range Arena benchmark, we recorded an average score that is at least 0.68% +higher than all comparison models including Full Attention, MoEs, and +transformer variants, with only 50% FLOPs of the best MoE method. In image +classification, our model yielded an average accuracy improvement of 1.75% over +the best model while maintaining comparable FLOPs. The source codes are +available at https://github.com/YujiaoYang-work/UoE. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ CE-U: Cross Entropy Unlearning + + +
+ Large language models (LLMs) inadvertently memorize sensitive data from their +massive pretraining corpora \cite{jang2022knowledge}. In this work, we propose +CE-U (Cross Entropy Unlearning), a novel loss function designed specifically +for unlearning tasks. CE-U addresses fundamental limitations of gradient ascent +approaches which suffer from instability due to vanishing gradients when model +confidence is high and gradient exploding when confidence is low. We also unify +standard cross entropy supervision and cross entropy unlearning into a single +framework. Notably, on the TOFU benchmark for unlearning \cite{maini2024tofu}, +CE-U achieves state-of-the-art results on LLaMA2-7B with 1\% and 5\% +forgetting, even without the use of any extra reference model or additional +positive samples. Our theoretical analysis further reveals that the gradient +instability issues also exist in popular reinforcement learning algorithms like +DPO \cite{rafailov2023direct} and GRPO\cite{Shao2024DeepSeekMath}, as they +include a gradient ascent component. This suggests that applying CE-U +principles to reinforcement learning could be a promising direction for +improving stability and convergence. + +
+
+
+
+
+ + ♻ ☆ Comparing hundreds of machine learning classifiers and discrete choice + models in predicting travel behavior: an empirical benchmark + + +
+ Numerous studies have compared machine learning (ML) and discrete choice +models (DCMs) in predicting travel demand. However, these studies often lack +generalizability as they compare models deterministically without considering +contextual variations. To address this limitation, our study develops an +empirical benchmark by designing a tournament model, thus efficiently +summarizing a large number of experiments, quantifying the randomness in model +comparisons, and using formal statistical tests to differentiate between the +model and contextual effects. This benchmark study compares two large-scale +data sources: a database compiled from literature review summarizing 136 +experiments from 35 studies, and our own experiment data, encompassing a total +of 6,970 experiments from 105 models and 12 model families. This benchmark +study yields two key findings. Firstly, many ML models, particularly the +ensemble methods and deep learning, statistically outperform the DCM family +(i.e., multinomial, nested, and mixed logit models). However, this study also +highlights the crucial role of the contextual factors (i.e., data sources, +inputs and choice categories), which can explain models' predictive performance +more effectively than the differences in model types alone. Model performance +varies significantly with data sources, improving with larger sample sizes and +lower dimensional alternative sets. After controlling all the model and +contextual factors, significant randomness still remains, implying inherent +uncertainty in such model comparisons. Overall, we suggest that future +researchers shift more focus from context-specific model comparisons towards +examining model transferability across contexts and characterizing the inherent +uncertainty in ML, thus creating more robust and generalizable next-generation +travel demand models. + +
+
+
+
+
+
+
+
+ + Multimedia 1 + +
+
+
+ + ☆ SMTPD: A New Benchmark for Temporal Prediction of Social Media + Popularity CVPR 2025 + + +
+ Social media popularity prediction task aims to predict the popularity of +posts on social media platforms, which has a positive driving effect on +application scenarios such as content optimization, digital marketing and +online advertising. Though many studies have made significant progress, few of +them pay much attention to the integration between popularity prediction with +temporal alignment. In this paper, with exploring YouTube's multilingual and +multi-modal content, we construct a new social media temporal popularity +prediction benchmark, namely SMTPD, and suggest a baseline framework for +temporal popularity prediction. Through data analysis and experiments, we +verify that temporal alignment and early popularity play crucial roles in +social media popularity prediction for not only deepening the understanding of +temporal dynamics of popularity in social media but also offering a suggestion +about developing more effective prediction models in this field. Code is +available at https://github.com/zhuwei321/SMTPD. + +
+
+ comment: accepted by CVPR 2025 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 149 + +
+
+
+ + ☆ GEN3C: 3D-Informed World-Consistent Video Generation with Precise Camera + Control CVPR 2025 + + +
+ We present GEN3C, a generative video model with precise Camera Control and +temporal 3D Consistency. Prior video models already generate realistic videos, +but they tend to leverage little 3D information, leading to inconsistencies, +such as objects popping in and out of existence. Camera control, if implemented +at all, is imprecise, because camera parameters are mere inputs to the neural +network which must then infer how the video depends on the camera. In contrast, +GEN3C is guided by a 3D cache: point clouds obtained by predicting the +pixel-wise depth of seed images or previously generated frames. When generating +the next frames, GEN3C is conditioned on the 2D renderings of the 3D cache with +the new camera trajectory provided by the user. Crucially, this means that +GEN3C neither has to remember what it previously generated nor does it have to +infer the image structure from the camera pose. The model, instead, can focus +all its generative power on previously unobserved regions, as well as advancing +the scene state to the next frame. Our results demonstrate more precise camera +control than prior work, as well as state-of-the-art results in sparse-view +novel view synthesis, even in challenging settings such as driving scenes and +monocular dynamic video. Results are best viewed in videos. Check out our +webpage! https://research.nvidia.com/labs/toronto-ai/GEN3C/ + +
+
+ comment: To appear in CVPR 2025. Website: + https://research.nvidia.com/labs/toronto-ai/GEN3C/ +
+
+
+
+
+ + ☆ OTTER: A Vision-Language-Action Model with Text-Aware Visual Feature + Extraction + + +
+ Vision-Language-Action (VLA) models aim to predict robotic actions based on +visual observations and language instructions. Existing approaches require +fine-tuning pre-trained vision-language models (VLMs) as visual and language +features are independently fed into downstream policies, degrading the +pre-trained semantic alignments. We propose OTTER, a novel VLA architecture +that leverages these existing alignments through explicit, text-aware visual +feature extraction. Instead of processing all visual features, OTTER +selectively extracts and passes only task-relevant visual features that are +semantically aligned with the language instruction to the policy transformer. +This allows OTTER to keep the pre-trained vision-language encoders frozen. +Thereby, OTTER preserves and utilizes the rich semantic understanding learned +from large-scale pre-training, enabling strong zero-shot generalization +capabilities. In simulation and real-world experiments, OTTER significantly +outperforms existing VLA models, demonstrating strong zero-shot generalization +to novel objects and environments. Video, code, checkpoints, and dataset: +https://ottervla.github.io/. + +
+
+
+
+
+ + ☆ Rethinking Deep Clustering Paradigms: Self-Supervision Is All You Need + + +
+ The recent advances in deep clustering have been made possible by significant +progress in self-supervised and pseudo-supervised learning. However, the +trade-off between self-supervision and pseudo-supervision can give rise to +three primary issues. The joint training causes Feature Randomness and Feature +Drift, whereas the independent training causes Feature Randomness and Feature +Twist. In essence, using pseudo-labels generates random and unreliable +features. The combination of pseudo-supervision and self-supervision drifts the +reliable clustering-oriented features. Moreover, moving from self-supervision +to pseudo-supervision can twist the curved latent manifolds. This paper +addresses the limitations of existing deep clustering paradigms concerning +Feature Randomness, Feature Drift, and Feature Twist. We propose a new paradigm +with a new strategy that replaces pseudo-supervision with a second round of +self-supervision training. The new strategy makes the transition between +instance-level self-supervision and neighborhood-level self-supervision +smoother and less abrupt. Moreover, it prevents the drifting effect that is +caused by the strong competition between instance-level self-supervision and +clustering-level pseudo-supervision. Moreover, the absence of the +pseudo-supervision prevents the risk of generating random features. With this +novel approach, our paper introduces a Rethinking of the Deep Clustering +Paradigms, denoted by R-DC. Our model is specifically designed to address three +primary challenges encountered in Deep Clustering: Feature Randomness, Feature +Drift, and Feature Twist. Experimental results conducted on six datasets have +shown that the two-level self-supervision training yields substantial +improvements. + +
+
+
+
+
+ + ☆ Active 6D Pose Estimation for Textureless Objects using Multi-View RGB + Frames + + +
+ Estimating the 6D pose of textureless objects from RGB images is an important +problem in robotics. Due to appearance ambiguities, rotational symmetries, and +severe occlusions, single-view based 6D pose estimators are still unable to +handle a wide range of objects, motivating research towards multi-view pose +estimation and next-best-view prediction that addresses these limitations. In +this work, we propose a comprehensive active perception framework for +estimating the 6D poses of textureless objects using only RGB images. Our +approach is built upon a key idea: decoupling the 6D pose estimation into a +sequential two-step process can greatly improve both accuracy and efficiency. +First, we estimate the 3D translation of each object, resolving scale and depth +ambiguities inherent to RGB images. These estimates are then used to simplify +the subsequent task of determining the 3D orientation, which we achieve through +canonical scale template matching. Building on this formulation, we then +introduce an active perception strategy that predicts the next best camera +viewpoint to capture an RGB image, effectively reducing object pose uncertainty +and enhancing pose accuracy. We evaluate our method on the public ROBI dataset +as well as on a transparent object dataset that we created. When evaluated +using the same camera viewpoints, our multi-view pose estimation significantly +outperforms state-of-the-art approaches. Furthermore, by leveraging our +next-best-view strategy, our method achieves high object pose accuracy with +substantially fewer viewpoints than heuristic-based policies. + +
+
+
+
+
+ + ☆ Rethinking Video Tokenization: A Conditioned Diffusion-based Approach + + +
+ Video tokenizers, which transform videos into compact latent representations, +are key to video generation. Existing video tokenizers are based on the VAE +architecture and follow a paradigm where an encoder compresses videos into +compact latents, and a deterministic decoder reconstructs the original videos +from these latents. In this paper, we propose a novel +Conditioned Diffusion-based video +Tokenizer entitled CDT, which departs from +previous methods by replacing the deterministic decoder with a 3D causal +diffusion model. The reverse diffusion generative process of the decoder is +conditioned on the latent representations derived via the encoder. With a +feature caching and sampling acceleration, the framework efficiently +reconstructs high-fidelity videos of arbitrary lengths. Results show that +CDT achieves state-of-the-art performance in video reconstruction +tasks using just a single-step sampling. Even a smaller version of CDT +still achieves reconstruction results on par with the top two baselines. +Furthermore, the latent video generation model trained using CDT also +shows superior performance. + +
+
+
+
+
+ + ☆ DualDiff+: Dual-Branch Diffusion for High-Fidelity Video Generation with + Reward Guidance + + +
+ Accurate and high-fidelity driving scene reconstruction demands the effective +utilization of comprehensive scene information as conditional inputs. Existing +methods predominantly rely on 3D bounding boxes and BEV road maps for +foreground and background control, which fail to capture the full complexity of +driving scenes and adequately integrate multimodal information. In this work, +we present DualDiff, a dual-branch conditional diffusion model designed to +enhance driving scene generation across multiple views and video sequences. +Specifically, we introduce Occupancy Ray-shape Sampling (ORS) as a conditional +input, offering rich foreground and background semantics alongside 3D spatial +geometry to precisely control the generation of both elements. To improve the +synthesis of fine-grained foreground objects, particularly complex and distant +ones, we propose a Foreground-Aware Mask (FGM) denoising loss function. +Additionally, we develop the Semantic Fusion Attention (SFA) mechanism to +dynamically prioritize relevant information and suppress noise, enabling more +effective multimodal fusion. Finally, to ensure high-quality image-to-video +generation, we introduce the Reward-Guided Diffusion (RGD) framework, which +maintains global consistency and semantic coherence in generated videos. +Extensive experiments demonstrate that DualDiff achieves state-of-the-art +(SOTA) performance across multiple datasets. On the NuScenes dataset, DualDiff +reduces the FID score by 4.09% compared to the best baseline. In downstream +tasks, such as BEV segmentation, our method improves vehicle mIoU by 4.50% and +road mIoU by 1.70%, while in BEV 3D object detection, the foreground mAP +increases by 1.46%. Code will be made available at +https://github.com/yangzhaojason/DualDiff. + +
+
+
+
+
+ + ☆ A Generative Approach to High Fidelity 3D Reconstruction from Text Data + + +
+ The convergence of generative artificial intelligence and advanced computer +vision technologies introduces a groundbreaking approach to transforming +textual descriptions into three-dimensional representations. This research +proposes a fully automated pipeline that seamlessly integrates text-to-image +generation, various image processing techniques, and deep learning methods for +reflection removal and 3D reconstruction. By leveraging state-of-the-art +generative models like Stable Diffusion, the methodology translates natural +language inputs into detailed 3D models through a multi-stage workflow. + The reconstruction process begins with the generation of high-quality images +from textual prompts, followed by enhancement by a reinforcement learning agent +and reflection removal using the Stable Delight model. Advanced image upscaling +and background removal techniques are then applied to further enhance visual +fidelity. These refined two-dimensional representations are subsequently +transformed into volumetric 3D models using sophisticated machine learning +algorithms, capturing intricate spatial relationships and geometric +characteristics. This process achieves a highly structured and detailed output, +ensuring that the final 3D models reflect both semantic accuracy and geometric +precision. + This approach addresses key challenges in generative reconstruction, such as +maintaining semantic coherence, managing geometric complexity, and preserving +detailed visual information. Comprehensive experimental evaluations will assess +reconstruction quality, semantic accuracy, and geometric fidelity across +diverse domains and varying levels of complexity. By demonstrating the +potential of AI-driven 3D reconstruction techniques, this research offers +significant implications for fields such as augmented reality (AR), virtual +reality (VR), and digital content creation. + +
+
+
+
+
+ + ☆ LION-FS: Fast & Slow Video-Language Thinker as Online Video Assistant CVPR 2025 + + +
+ First-person video assistants are highly anticipated to enhance our daily +lives through online video dialogue. However, existing online video assistants +often sacrifice assistant efficacy for real-time efficiency by processing +low-frame-rate videos with coarse-grained visual features. To overcome the +trade-off between efficacy and efficiency, we propose "Fast & Slow +Video-Language Thinker" as an onLIne videO assistaNt, LION-FS, achieving +real-time, proactive, temporally accurate, and contextually precise responses. +LION-FS adopts a two-stage optimization strategy: 1) Fast Path: Routing-Based +Response Determination evaluates frame-by-frame whether an immediate response +is necessary. To enhance response determination accuracy and handle higher +frame-rate inputs efficiently, we employ Token Aggregation Routing to +dynamically fuse spatiotemporal features without increasing token numbers, +while utilizing Token Dropping Routing to eliminate redundant features. 2) Slow +Path: Multi-granularity Keyframe Augmentation optimizes keyframes during +response generation. To provide comprehensive and detailed responses beyond +atomic actions constrained by training data, fine-grained spatial features and +human-environment interaction features are extracted through multi-granular +pooling. These features are further integrated into a meticulously designed +multimodal Thinking Template to guide more precise response generation. +Comprehensive evaluations on online video tasks demonstrate that LION-FS +achieves state-of-the-art efficacy and efficiency. + +
+
+ comment: Accepted to CVPR 2025 +
+
+
+
+
+ + ☆ Improving 6D Object Pose Estimation of metallic Household and Industry + Objects + + +
+ 6D object pose estimation suffers from reduced accuracy when applied to +metallic objects. We set out to improve the state-of-the-art by addressing +challenges such as reflections and specular highlights in industrial +applications. Our novel BOP-compatible dataset, featuring a diverse set of +metallic objects (cans, household, and industrial items) under various lighting +and background conditions, provides additional geometric and visual cues. We +demonstrate that these cues can be effectively leveraged to enhance overall +performance. To illustrate the usefulness of the additional features, we +improve upon the GDRNPP algorithm by introducing an additional keypoint +prediction and material estimator head in order to improve spatial scene +understanding. Evaluations on the new dataset show improved accuracy for +metallic objects, supporting the hypothesis that additional geometric and +visual cues can improve learning. + +
+
+
+
+
+ + ☆ DoraCycle: Domain-Oriented Adaptation of Unified Generative Model in + Multimodal Cycles CVPR 2025 + + +
+ Adapting generative models to specific domains presents an effective solution +for satisfying specialized requirements. However, adapting to some complex +domains remains challenging, especially when these domains require substantial +paired data to capture the targeted distributions. Since unpaired data from a +single modality, such as vision or language, is more readily available, we +utilize the bidirectional mappings between vision and language learned by the +unified generative model to enable training on unpaired data for domain +adaptation. Specifically, we propose DoraCycle, which integrates two multimodal +cycles: text-to-image-to-text and image-to-text-to-image. The model is +optimized through cross-entropy loss computed at the cycle endpoints, where +both endpoints share the same modality. This facilitates self-evolution of the +model without reliance on annotated text-image pairs. Experimental results +demonstrate that for tasks independent of paired knowledge, such as +stylization, DoraCycle can effectively adapt the unified model using only +unpaired data. For tasks involving new paired knowledge, such as specific +identities, a combination of a small set of paired image-text examples and +larger-scale unpaired data is sufficient for effective domain-oriented +adaptation. The code will be released at https://github.com/showlab/DoraCycle. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ☆ DongbaMIE: A Multimodal Information Extraction Dataset for Evaluating + Semantic Understanding of Dongba Pictograms + + +
+ Dongba pictographs are the only pictographs still in use in the world. They +have pictorial ideographic features, and their symbols carry rich cultural and +contextual information. Due to the lack of relevant datasets, existing research +has difficulty in advancing the study of semantic understanding of Dongba +pictographs. To this end, we propose DongbaMIE, the first multimodal dataset +for semantic understanding and extraction of Dongba pictographs. The dataset +consists of Dongba pictograph images and their corresponding Chinese semantic +annotations. It contains 23,530 sentence-level and 2,539 paragraph-level +images, covering four semantic dimensions: objects, actions, relations, and +attributes. We systematically evaluate the GPT-4o, Gemini-2.0, and Qwen2-VL +models. Experimental results show that the F1 scores of GPT-4o and Gemini in +the best object extraction are only 3.16 and 3.11 respectively. The F1 score of +Qwen2-VL after supervised fine-tuning is only 11.49. These results suggest that +current large multimodal models still face significant challenges in accurately +recognizing the diverse semantic information in Dongba pictographs. The dataset +can be obtained from this URL. + +
+
+
+
+
+ + ☆ An Adaptive Underwater Image Enhancement Framework via Multi-Domain + Fusion and Color Compensation + + +
+ Underwater optical imaging is severely degraded by light absorption, +scattering, and color distortion, hindering visibility and accurate image +analysis. This paper presents an adaptive enhancement framework integrating +illumination compensation, multi-domain filtering, and dynamic color +correction. A hybrid illumination compensation strategy combining CLAHE, Gamma +correction, and Retinex enhances visibility. A two-stage filtering process, +including spatial-domain (Gaussian, Bilateral, Guided) and frequency-domain +(Fourier, Wavelet) methods, effectively reduces noise while preserving details. +To correct color distortion, an adaptive color compensation (ACC) model +estimates spectral attenuation and water type to combine RCP, DCP, and MUDCP +dynamically. Finally, a perceptually guided color balance mechanism ensures +natural color restoration. Experimental results on benchmark datasets +demonstrate superior performance over state-of-the-art methods in contrast +enhancement, color correction, and structural preservation, making the +framework robust for underwater imaging applications. + +
+
+
+
+
+ + ☆ 4D Radar Ground Truth Augmentation with LiDAR-to-4D Radar Data Synthesis + + +
+ Ground truth augmentation (GT-Aug) is a common method for LiDAR-based object +detection, as it enhances object density by leveraging ground truth bounding +boxes (GT bboxes). However, directly applying GT-Aug to 4D Radar tensor data +overlooks important measurements outside the GT bboxes-such as +sidelobes-leading to synthetic distributions that deviate from real-world 4D +Radar data. To address this limitation, we propose 4D Radar Ground Truth +Augmentation (4DR GT-Aug). Our approach first augments LiDAR data and then +converts it to 4D Radar data via a LiDAR-to-4D Radar data synthesis (L2RDaS) +module, which explicitly accounts for measurements both inside and outside GT +bboxes. In doing so, it produces 4D Radar data distributions that more closely +resemble real-world measurements, thereby improving object detection accuracy. +Experiments on the K-Radar dataset show that the proposed method achieves +improved performance compared to conventional GT-Aug in object detection for 4D +Radar. The implementation code is available at +https://github.com/kaist-avelab/K-Radar. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ CLIP is Strong Enough to Fight Back: Test-time Counterattacks towards + Zero-shot Adversarial Robustness of CLIP CVPR 2025 + + +
+ Despite its prevalent use in image-text matching tasks in a zero-shot manner, +CLIP has been shown to be highly vulnerable to adversarial perturbations added +onto images. Recent studies propose to finetune the vision encoder of CLIP with +adversarial samples generated on the fly, and show improved robustness against +adversarial attacks on a spectrum of downstream datasets, a property termed as +zero-shot robustness. In this paper, we show that malicious perturbations that +seek to maximise the classification loss lead to 'falsely stable' images, and +propose to leverage the pre-trained vision encoder of CLIP to counterattack +such adversarial images during inference to achieve robustness. Our paradigm is +simple and training-free, providing the first method to defend CLIP from +adversarial attacks at test time, which is orthogonal to existing methods +aiming to boost zero-shot adversarial robustness of CLIP. We conduct +experiments across 16 classification datasets, and demonstrate stable and +consistent gains compared to test-time defence methods adapted from existing +adversarial robustness studies that do not rely on external networks, without +noticeably impairing performance on clean images. We also show that our +paradigm can be employed on CLIP models that have been adversarially finetuned +to further enhance their robustness at test time. Our code is available at +https://github.com/Sxing2/CLIP-Test-time-Counterattacks. + +
+
+ comment: Accepted to CVPR 2025 +
+
+
+
+
+ + ☆ REGRACE: A Robust and Efficient Graph-based Re-localization Algorithm + using Consistency Evaluation IROS2025 + + +
+ Loop closures are essential for correcting odometry drift and creating +consistent maps, especially in the context of large-scale navigation. Current +methods using dense point clouds for accurate place recognition do not scale +well due to computationally expensive scan-to-scan comparisons. Alternative +object-centric approaches are more efficient but often struggle with +sensitivity to viewpoint variation. In this work, we introduce REGRACE, a novel +approach that addresses these challenges of scalability and perspective +difference in re-localization by using LiDAR-based submaps. We introduce +rotation-invariant features for each labeled object and enhance them with +neighborhood context through a graph neural network. To identify potential +revisits, we employ a scalable bag-of-words approach, pooling one learned +global feature per submap. Additionally, we define a revisit with geometrical +consistency cues rather than embedding distance, allowing us to recognize +far-away loop closures. Our evaluations demonstrate that REGRACE achieves +similar results compared to state-of-the-art place recognition and registration +baselines while being twice as fast. + +
+
+ comment: Submitted to IROS2025 +
+
+
+
+
+ + ☆ Towards Visual Discrimination and Reasoning of Real-World Physical + Dynamics: Physics-Grounded Anomaly Detection CVPR 2025 + + +
+ Humans detect real-world object anomalies by perceiving, interacting, and +reasoning based on object-conditioned physical knowledge. The long-term goal of +Industrial Anomaly Detection (IAD) is to enable machines to autonomously +replicate this skill. However, current IAD algorithms are largely developed and +tested on static, semantically simple datasets, which diverge from real-world +scenarios where physical understanding and reasoning are essential. To bridge +this gap, we introduce the Physics Anomaly Detection (Phys-AD) dataset, the +first large-scale, real-world, physics-grounded video dataset for industrial +anomaly detection. Collected using a real robot arm and motor, Phys-AD provides +a diverse set of dynamic, semantically rich scenarios. The dataset includes +more than 6400 videos across 22 real-world object categories, interacting with +robot arms and motors, and exhibits 47 types of anomalies. Anomaly detection in +Phys-AD requires visual reasoning, combining both physical knowledge and video +content to determine object abnormality. We benchmark state-of-the-art anomaly +detection methods under three settings: unsupervised AD, weakly-supervised AD, +and video-understanding AD, highlighting their limitations in handling +physics-grounded anomalies. Additionally, we introduce the Physics Anomaly +Explanation (PAEval) metric, designed to assess the ability of visual-language +foundation models to not only detect anomalies but also provide accurate +explanations for their underlying physical causes. Our dataset and benchmark +will be publicly available. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ☆ High-Quality Virtual Single-Viewpoint Surgical Video: Geometric + Autocalibration of Multiple Cameras in Surgical Lights MICCAI2023 + + +
+ Occlusion-free video generation is challenging due to surgeons' obstructions +in the camera field of view. Prior work has addressed this issue by installing +multiple cameras on a surgical light, hoping some cameras will observe the +surgical field with less occlusion. However, this special camera setup poses a +new imaging challenge since camera configurations can change every time +surgeons move the light, and manual image alignment is required. This paper +proposes an algorithm to automate this alignment task. The proposed method +detects frames where the lighting system moves, realigns them, and selects the +camera with the least occlusion. This algorithm results in a stabilized video +with less occlusion. Quantitative results show that our method outperforms +conventional approaches. A user study involving medical doctors also confirmed +the superiority of our method. + +
+
+ comment: Accepted at MICCAI2023 +
+
+
+
+
+ + ☆ Afford-X: Generalizable and Slim Affordance Reasoning for Task-oriented + Manipulation + + +
+ Object affordance reasoning, the ability to infer object functionalities +based on physical properties, is fundamental for task-oriented planning and +activities in both humans and Artificial Intelligence (AI). This capability, +required for planning and executing daily activities in a task-oriented manner, +relies on commonsense knowledge of object physics and functionalities, +extending beyond simple object recognition. Current computational models for +affordance reasoning from perception lack generalizability, limiting their +applicability in novel scenarios. Meanwhile, comprehensive Large Language +Models (LLMs) with emerging reasoning capabilities are challenging to deploy on +local devices for task-oriented manipulations. Here, we introduce LVIS-Aff, a +large-scale dataset comprising 1,496 tasks and 119k images, designed to enhance +the generalizability of affordance reasoning from perception. Utilizing this +dataset, we develop Afford-X, an end-to-end trainable affordance reasoning +model that incorporates Verb Attention and Bi-Fusion modules to improve +multi-modal understanding. This model achieves up to a 12.1% performance +improvement over the best-reported results from non-LLM methods, while also +demonstrating a 1.2% enhancement compared to our previous conference paper. +Additionally, it maintains a compact 187M parameter size and infers nearly 50 +times faster than the GPT-4V API. Our work demonstrates the potential for +efficient, generalizable affordance reasoning models that can be deployed on +local devices for task-oriented manipulations. We showcase Afford-X's +effectiveness in enabling task-oriented manipulations for robots across various +tasks and environments, underscoring its efficiency and broad implications for +advancing robotics and AI systems in real-world applications. + +
+
+
+
+
+ + ☆ Simulation-Based Performance Evaluation of 3D Object Detection Methods + with Deep Learning for a LiDAR Point Cloud Dataset in a SOTIF-related Use + Case + + +
+ Safety of the Intended Functionality (SOTIF) addresses sensor performance +limitations and deep learning-based object detection insufficiencies to ensure +the intended functionality of Automated Driving Systems (ADS). This paper +presents a methodology examining the adaptability and performance evaluation of +the 3D object detection methods on a LiDAR point cloud dataset generated by +simulating a SOTIF-related Use Case. The major contributions of this paper +include defining and modelling a SOTIF-related Use Case with 21 diverse weather +conditions and generating a LiDAR point cloud dataset suitable for application +of 3D object detection methods. The dataset consists of 547 frames, +encompassing clear, cloudy, rainy weather conditions, corresponding to +different times of the day, including noon, sunset, and night. Employing +MMDetection3D and OpenPCDET toolkits, the performance of State-of-the-Art +(SOTA) 3D object detection methods is evaluated and compared by testing the +pre-trained Deep Learning (DL) models on the generated dataset using Average +Precision (AP) and Recall metrics. + +
+
+
+
+
+ + ☆ A self-supervised cyclic neural-analytic approach for novel view + synthesis and 3D reconstruction BMVC 2024 + + +
+ Generating novel views from recorded videos is crucial for enabling +autonomous UAV navigation. Recent advancements in neural rendering have +facilitated the rapid development of methods capable of rendering new +trajectories. However, these methods often fail to generalize well to regions +far from the training data without an optimized flight path, leading to +suboptimal reconstructions. We propose a self-supervised cyclic neural-analytic +pipeline that combines high-quality neural rendering outputs with precise +geometric insights from analytical methods. Our solution improves RGB and mesh +reconstructions for novel view synthesis, especially in undersampled areas and +regions that are completely different from the training dataset. We use an +effective transformer-based architecture for image reconstruction to refine and +adapt the synthesis process, enabling effective handling of novel, unseen poses +without relying on extensive labeled datasets. Our findings demonstrate +substantial improvements in rendering views of novel and also 3D +reconstruction, which to the best of our knowledge is a first, setting a new +standard for autonomous navigation in complex outdoor environments. + +
+
+ comment: Published in BMVC 2024, 10 pages, 4 figures +
+
+
+
+
+ + ☆ Unified Human Localization and Trajectory Prediction with Monocular + Vision ICRA 2025 + + +
+ Conventional human trajectory prediction models rely on clean curated data, +requiring specialized equipment or manual labeling, which is often impractical +for robotic applications. The existing predictors tend to overfit to clean +observation affecting their robustness when used with noisy inputs. In this +work, we propose MonoTransmotion (MT), a Transformer-based framework that uses +only a monocular camera to jointly solve localization and prediction tasks. Our +framework has two main modules: Bird's Eye View (BEV) localization and +trajectory prediction. The BEV localization module estimates the position of a +person using 2D human poses, enhanced by a novel directional loss for smoother +sequential localizations. The trajectory prediction module predicts future +motion from these estimates. We show that by jointly training both tasks with +our unified framework, our method is more robust in real-world scenarios made +of noisy inputs. We validate our MT network on both curated and non-curated +datasets. On the curated dataset, MT achieves around 12% improvement over +baseline models on BEV localization and trajectory prediction. On real-world +non-curated dataset, experimental results indicate that MT maintains similar +performance levels, highlighting its robustness and generalization capability. +The code is available at https://github.com/vita-epfl/MonoTransmotion. + +
+
+ comment: ICRA 2025 +
+
+
+
+
+ + ☆ AdaSin: Enhancing Hard Sample Metrics with Dual Adaptive Penalty for + Face Recognition + + +
+ In recent years, the emergence of deep convolutional neural networks has +positioned face recognition as a prominent research focus in computer vision. +Traditional loss functions, such as margin-based, hard-sample mining-based, and +hybrid approaches, have achieved notable performance improvements, with some +leveraging curriculum learning to optimize training. However, these methods +often fall short in effectively quantifying the difficulty of hard samples. To +address this, we propose the Adaptive Sine (AdaSin) loss function, which introduces +the sine of the angle between a sample's embedding feature and its ground-truth +class center as a novel difficulty metric. This metric enables precise and +effective penalization of hard samples. By incorporating curriculum learning, +the model dynamically adjusts classification boundaries across different +training stages. Unlike previous adaptive-margin loss functions, AdaSin +introduces a dual adaptive penalty, applied to both the positive and negative +cosine similarities of hard samples. This design imposes stronger constraints, +enhancing intra-class compactness and inter-class separability. The combination +of the dual adaptive penalty and curriculum learning is guided by a +well-designed difficulty metric. It enables the model to focus more effectively +on hard samples in later training stages, and leads to the extraction of highly +discriminative face features. Extensive experiments across eight benchmarks +demonstrate that AdaSin achieves superior accuracy compared to other +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Do ImageNet-trained models learn shortcuts? The impact of frequency + shortcuts on generalization CVPR2025 + + +
+ Frequency shortcuts refer to specific frequency patterns that models heavily +rely on for correct classification. Previous studies have shown that models +trained on small image datasets often exploit such shortcuts, potentially +impairing their generalization performance. However, existing methods for +identifying frequency shortcuts require expensive computations and become +impractical for analyzing models trained on large datasets. In this work, we +propose the first approach to more efficiently analyze frequency shortcuts at a +larger scale. We show that both CNN and transformer models learn frequency +shortcuts on ImageNet. We also expose that frequency shortcut solutions can +yield good performance on out-of-distribution (OOD) test sets which largely +retain texture information. However, these shortcuts, mostly aligned with +texture patterns, hinder model generalization on rendition-based OOD test sets. +These observations suggest that current OOD evaluations often overlook the +impact of frequency shortcuts on model generalization. Future benchmarks could +thus benefit from explicitly assessing and accounting for these shortcuts to +build models that generalize across a broader range of OOD scenarios. + +
+
+ comment: received at CVPR2025 +
+
+
+
+
+ + ☆ Mineral segmentation using electron microscope images and spectral + sampling through multimodal graph neural networks + + +
+ We propose a novel Graph Neural Network-based method for segmentation based +on data fusion of multimodal Scanning Electron Microscope (SEM) images. In most +cases, Backscattered Electron (BSE) images obtained using SEM do not contain +sufficient information for mineral segmentation. Therefore, imaging is often +complemented with point-wise Energy-Dispersive X-ray Spectroscopy (EDS) +spectral measurements that provide highly accurate information about the +chemical composition but that are time-consuming to acquire. This motivates the +use of sparse spectral data in conjunction with BSE images for mineral +segmentation. The unstructured nature of the spectral data makes most +traditional image fusion techniques unsuitable for BSE-EDS fusion. We propose +using graph neural networks to fuse the two modalities and segment the mineral +phases simultaneously. Our results demonstrate that providing EDS data for as +few as 1% of BSE pixels produces accurate segmentation, enabling rapid analysis +of mineral samples. The proposed data fusion pipeline is versatile and can be +adapted to other domains that involve image data and point-wise measurements. + +
+
+
+
+
+ + ☆ CarGait: Cross-Attention based Re-ranking for Gait recognition + + +
+ Gait recognition is a computer vision task that identifies individuals based +on their walking patterns. Gait recognition performance is commonly evaluated +by ranking a gallery of candidates and measuring the accuracy at the top +Rank-$K$. Existing models are typically single-staged, i.e. searching for the +probe's nearest neighbors in a gallery using a single global feature +representation. Although these models typically excel at retrieving the correct +identity within the top-$K$ predictions, they struggle when hard negatives +appear in the top short-list, leading to relatively low performance at the +highest ranks (e.g., Rank-1). In this paper, we introduce CarGait, a +Cross-Attention Re-ranking method for gait recognition, that involves +re-ordering the top-$K$ list leveraging the fine-grained correlations between +pairs of gait sequences through cross-attention between gait strips. This +re-ranking scheme can be adapted to existing single-stage models to enhance +their final results. We demonstrate the capabilities of CarGait by extensive +experiments on three common gait datasets, Gait3D, GREW, and OU-MVLP, and seven +different gait models, showing consistent improvements in Rank-1,5 accuracy, +superior results over existing re-ranking methods, and strong baselines. + +
+
+
+
+
+ + ☆ Find First, Track Next: Decoupling Identification and Propagation in + Referring Video Object Segmentation + + +
+ Referring video object segmentation aims to segment and track a target object +in a video using a natural language prompt. Existing methods typically fuse +visual and textual features in a highly entangled manner, processing +multi-modal information together to generate per-frame masks. However, this +approach often struggles with ambiguous target identification, particularly in +scenes with multiple similar objects, and fails to ensure consistent mask +propagation across frames. To address these limitations, we introduce +FindTrack, a novel decoupled framework that separates target identification +from mask propagation. FindTrack first adaptively selects a key frame by +balancing segmentation confidence and vision-text alignment, establishing a +robust reference for the target object. This reference is then utilized by a +dedicated propagation module to track and segment the object across the entire +video. By decoupling these processes, FindTrack effectively reduces ambiguities +in target association and enhances segmentation consistency. We demonstrate +that FindTrack outperforms existing methods on public benchmarks. + +
+
+
+
+
+ + ☆ Feature Point Extraction for Extra-Affine Image + + +
+ The issue concerning the significant decline in the stability of feature +extraction for images subjected to large-angle affine transformations, where +the angle exceeds 50 degrees, still awaits a satisfactory solution. Even ASIFT, +which is built upon SIFT and entails a considerable number of image comparisons +simulated by affine transformations, inevitably exhibits the drawbacks of being +time-consuming and imposing high demands on memory usage. And the stability of +feature extraction drops rapidly under large-view affine transformations. +Consequently, we propose a method that represents an improvement over ASIFT. On +the premise of improving the precision and maintaining the affine invariance, +it currently ranks as the fastest feature extraction method for extra-affine +images that we know of at present. Simultaneously, the stability of feature +extraction regarding affine transformation images has been approximated to the +maximum limits. Both the angle between the shooting direction and the normal +direction of the photographed object (absolute tilt angle), and the shooting +transformation angle between two images (transition tilt angle) are close to 90 +degrees. The central idea of the method lies in obtaining the optimal parameter +set by simulating affine transformation with the reference image. And the +simulated affine transformation is reproduced by combining it with the Lanczos +interpolation based on the optimal parameter set. Subsequently, it is combined +with ORB, which exhibits excellent real-time performance for rapid orientation +binary description. Moreover, a scale parameter simulation is introduced to +further augment the operational efficiency. + +
+
+
+
+
+ + ☆ Bridging Synthetic-to-Real Gaps: Frequency-Aware Perturbation and + Selection for Single-shot Multi-Parametric Mapping Reconstruction + + +
+ Data-centric artificial intelligence (AI) has remarkably advanced medical +imaging, with emerging methods using synthetic data to address data scarcity +while introducing synthetic-to-real gaps. Unsupervised domain adaptation (UDA) +shows promise in ground truth-scarce tasks, but its application in +reconstruction remains underexplored. Although multiple overlapping-echo +detachment (MOLED) achieves ultra-fast multi-parametric reconstruction, +extending its application to various clinical scenarios, the quality suffers +from deficiency in mitigating the domain gap, difficulty in maintaining +structural integrity, and inadequacy in ensuring mapping accuracy. To resolve +these issues, we propose frequency-aware perturbation and selection (FPS), +comprising Wasserstein distance-modulated frequency-aware perturbation (WDFP) +and hierarchical frequency-aware selection network (HFSNet), which integrates +frequency-aware adaptive selection (FAS), compact FAS (cFAS) and feature-aware +architecture integration (FAI). Specifically, perturbation activates +domain-invariant feature learning within uncertainty, while selection refines +optimal solutions within perturbation, establishing a robust and closed-loop +learning pathway. Extensive experiments on synthetic data, along with diverse +real clinical cases from 5 healthy volunteers, 94 ischemic stroke patients, and +46 meningioma patients, demonstrate the superiority and clinical applicability +of FPS. Furthermore, FPS is applied to diffusion tensor imaging (DTI), +underscoring its versatility and potential for broader medical applications. +The code is available at https://github.com/flyannie/FPS. + +
+
+ comment: This work will be submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ DTU-Net: A Multi-Scale Dilated Transformer Network for Nonlinear + Hyperspectral Unmixing + + +
+ Transformers have shown significant success in hyperspectral unmixing (HU). +However, challenges remain. While multi-scale and long-range spatial +correlations are essential in unmixing tasks, current Transformer-based +unmixing networks, built on Vision Transformer (ViT) or Swin-Transformer, +struggle to capture them effectively. Additionally, current Transformer-based +unmixing networks rely on the linear mixing model, which lacks the flexibility +to accommodate scenarios where nonlinear effects are significant. To address +these limitations, we propose a multi-scale Dilated Transformer-based unmixing +network for nonlinear HU (DTU-Net). The encoder employs two branches. The first +one performs multi-scale spatial feature extraction using Multi-Scale Dilated +Attention (MSDA) in the Dilated Transformer, which varies dilation rates across +attention heads to capture long-range and multi-scale spatial correlations. The +second one performs spectral feature extraction utilizing 3D-CNNs with channel +attention. The outputs from both branches are then fused to integrate +multi-scale spatial and spectral information, which is subsequently transformed +to estimate the abundances. The decoder is designed to accommodate both linear +and nonlinear mixing scenarios. Its interpretability is enhanced by explicitly +modeling the relationships between endmembers, abundances, and nonlinear +coefficients in accordance with the polynomial post-nonlinear mixing model +(PPNMM). Experiments on synthetic and real datasets validate the effectiveness +of the proposed DTU-Net compared to PPNMM-derived methods and several advanced +unmixing networks. + +
+
+
+
+
+ + ☆ Active Learning for Deep Learning-Based Hemodynamic Parameter Estimation + + +
+ Hemodynamic parameters such as pressure and wall shear stress play an +important role in diagnosis, prognosis, and treatment planning in +cardiovascular diseases. These parameters can be accurately computed using +computational fluid dynamics (CFD), but CFD is computationally intensive. +Hence, deep learning methods have been adopted as a surrogate to rapidly +estimate CFD outcomes. A drawback of such data-driven models is the need for +time-consuming reference CFD simulations for training. In this work, we +introduce an active learning framework to reduce the number of CFD simulations +required for the training of surrogate models, lowering the barriers to their +deployment in new applications. We propose three distinct querying strategies +to determine for which unlabeled samples CFD simulations should be obtained. +These querying strategies are based on geometrical variance, ensemble +uncertainty, and adherence to the physics governing fluid dynamics. We +benchmark these methods on velocity field estimation in synthetic coronary +artery bifurcations and find that they allow for substantial reductions in +annotation cost. Notably, we find that our strategies reduce the number of +samples required by up to 50% and make the trained models more robust to +difficult cases. Our results show that active learning is a feasible strategy +to increase the potential of deep learning-based CFD surrogates. + +
+
+
+
+
+ + ☆ Biased Heritage: How Datasets Shape Models in Facial Expression + Recognition + + +
+ In recent years, the rapid development of artificial intelligence (AI) +systems has raised concerns about our ability to ensure their fairness, that +is, how to avoid discrimination based on protected characteristics such as +gender, race, or age. While algorithmic fairness is well-studied in simple +binary classification tasks on tabular data, its application to complex, +real-world scenarios-such as Facial Expression Recognition (FER)-remains +underexplored. FER presents unique challenges: it is inherently multiclass, and +biases emerge across intersecting demographic variables, each potentially +comprising multiple protected groups. We present a comprehensive framework to +analyze bias propagation from datasets to trained models in image-based FER +systems, while introducing new bias metrics specifically designed for +multiclass problems with multiple demographic groups. Our methodology studies +bias propagation by (1) inducing controlled biases in FER datasets, (2) +training models on these biased datasets, and (3) analyzing the correlation +between dataset bias metrics and model fairness notions. Our findings reveal +that stereotypical biases propagate more strongly to model predictions than +representational biases, suggesting that preventing emotion-specific +demographic patterns should be prioritized over general demographic balance in +FER datasets. Additionally, we observe that biased datasets lead to reduced +model accuracy, challenging the assumed fairness-accuracy trade-off. + +
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ JamMa: Ultra-lightweight Local Feature Matching with Joint Mamba CVPR 2025 + + +
+ Existing state-of-the-art feature matchers capture long-range dependencies +with Transformers but are hindered by high spatial complexity, leading to +demanding training and high-latency inference. Striking a better balance between +performance and efficiency remains a challenge in feature matching. Inspired by +the linear complexity O(N) of Mamba, we propose an ultra-lightweight +Mamba-based matcher, named JamMa, which converges on a single GPU and achieves +an impressive performance-efficiency balance in inference. To unlock the +potential of Mamba for feature matching, we propose Joint Mamba with a +scan-merge strategy named JEGO, which enables: (1) Joint scan of two images to +achieve high-frequency mutual interaction, (2) Efficient scan with skip steps +to reduce sequence length, (3) Global receptive field, and (4) Omnidirectional +feature representation. With the above properties, the JEGO strategy +significantly outperforms the scan-merge strategies proposed in VMamba and +EVMamba in the feature matching task. Compared to attention-based sparse and +semi-dense matchers, JamMa demonstrates a superior balance between performance +and efficiency, delivering better performance with less than 50% of the +parameters and FLOPs. + +
+
+ comment: CVPR 2025, Project page: https://leoluxxx.github.io/JamMa-page/ +
+
+
+
+
+ + ☆ CoSDH: Communication-Efficient Collaborative Perception via + Supply-Demand Awareness and Intermediate-Late Hybridization CVPR 2025 + + +
+ Multi-agent collaborative perception enhances perceptual capabilities by +utilizing information from multiple agents and is considered a fundamental +solution to the problem of weak single-vehicle perception in autonomous +driving. However, existing collaborative perception methods face a dilemma +between communication efficiency and perception accuracy. To address this +issue, we propose a novel communication-efficient collaborative perception +framework based on supply-demand awareness and intermediate-late hybridization, +dubbed CoSDH. By modeling the supply-demand relationship between +agents, the framework refines the selection of collaboration regions, reducing +unnecessary communication cost while maintaining accuracy. In addition, we +innovatively introduce the intermediate-late hybrid collaboration mode, where +late-stage collaboration compensates for the performance degradation in +collaborative perception under low communication bandwidth. Extensive +experiments on multiple datasets, including both simulated and real-world +scenarios, demonstrate that CoSDH achieves state-of-the-art detection +accuracy and optimal bandwidth trade-offs, delivering superior detection +precision under real communication bandwidths, thus proving its effectiveness +and practical applicability. The code will be released at +https://github.com/Xu2729/CoSDH. + +
+
+ comment: Accepted at CVPR 2025 +
+
+
+
+
+ + ☆ Automatic Drywall Analysis for Progress Tracking and Quality Control in + Construction + + +
+ Digitalization in the construction industry has become essential, enabling +centralized, easy access to all relevant information of a building. Automated +systems can facilitate the timely and resource-efficient documentation of +changes, which is crucial for key processes such as progress tracking and +quality control. This paper presents a method for image-based automated drywall +analysis enabling construction progress and quality assessment through on-site +camera systems. Our proposed solution integrates a deep learning-based instance +segmentation model to detect and classify various drywall elements with an +analysis module to cluster individual wall segments, estimate camera +perspective distortions, and apply the corresponding corrections. This system +extracts valuable information from images, enabling more accurate progress +tracking and quality assessment on construction sites. Our main contributions +include a fully automated pipeline for drywall analysis, improving instance +segmentation accuracy through architecture modifications and targeted data +augmentation, and a novel algorithm to extract important information from the +segmentation results. Our modified model, enhanced with data augmentation, +achieves significantly higher accuracy compared to other architectures, +offering more detailed and precise information than existing approaches. +Combined with the proposed drywall analysis steps, it enables the reliable +automation of construction progress and quality assessment. + +
+
+
+
+
+ + ☆ Augmentation-Based Deep Learning for Identification of Circulating Tumor + Cells + + +
+ Circulating tumor cells (CTCs) are crucial biomarkers in liquid biopsy, +offering a noninvasive tool for cancer patient management. However, their +identification remains particularly challenging due to their limited number and +heterogeneity. Labeling samples for contrast limits the generalization of +fluorescence-based methods across different hospital datasets. Analyzing +single-cell images enables detailed assessment of cell morphology, subcellular +structures, and phenotypic variations, often hidden in clustered images. +Developing a method based on bright-field single-cell analysis could overcome +these limitations. CTCs can be isolated using an unbiased workflow combining +Parsortix technology, which selects cells based on size and deformability, with +DEPArray technology, enabling precise visualization and selection of single +cells. Traditionally, DEPArray-acquired digital images are manually analyzed, +making the process time-consuming and prone to variability. In this study, we +present a Deep Learning-based classification pipeline designed to distinguish +CTCs from leukocytes in blood samples, aimed to enhance diagnostic accuracy and +optimize clinical workflows. Our approach employs images from the bright-field +channel acquired through DEPArray technology leveraging a ResNet-based CNN. To +improve model generalization, we applied three types of data augmentation +techniques and incorporated fluorescence (DAPI) channel images into the +training phase, allowing the network to learn additional CTC-specific features. +Notably, only bright-field images have been used for testing, ensuring the +model's ability to identify CTCs without relying on fluorescence markers. The +proposed model achieved an F1-score of 0.798, demonstrating its capability to +distinguish CTCs from leukocytes. These findings highlight the potential of DL +in refining CTC analysis and advancing liquid biopsy applications. + +
+
+ comment: 20 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ AI-Driven Multi-Stage Computer Vision System for Defect Detection in + Laser-Engraved Industrial Nameplates + + +
+ Automated defect detection in industrial manufacturing is essential for +maintaining product quality and minimizing production errors. In air disc brake +manufacturing, ensuring the precision of laser-engraved nameplates is crucial +for accurate product identification and quality control. Engraving errors, such +as misprints or missing characters, can compromise both aesthetics and +functionality, leading to material waste and production delays. This paper +presents a proof of concept for an AI-driven computer vision system that +inspects and verifies laser-engraved nameplates, detecting defects in logos and +alphanumeric strings. The system integrates object detection using YOLOv7, +optical character recognition (OCR) with Tesseract, and anomaly detection +through a residual variational autoencoder (ResVAE) along with other computer +vision methods to enable comprehensive inspections at multiple stages. +Experimental results demonstrate the system's effectiveness, achieving 91.33% +accuracy and 100% recall, ensuring that defective nameplates are consistently +detected and addressed. This solution highlights the potential of AI-driven +visual inspection to enhance quality control, reduce manual inspection efforts, +and improve overall manufacturing efficiency. + +
+
+
+
+
+ + ☆ MIAdapt: Source-free Few-shot Domain Adaptive Object Detection for + Microscopic Images + + +
+ Existing generic unsupervised domain adaptation approaches require access to +both a large labeled source dataset and a sufficient unlabeled target dataset +during adaptation. However, collecting a large dataset, even if unlabeled, is a +challenging and expensive endeavor, especially in medical imaging. In addition, +constraints such as privacy issues can result in cases where source data is +unavailable. Taking into consideration these challenges, we propose MIAdapt, an +adaptive approach for Microscopic Imagery Adaptation as a solution for +Source-free Few-shot Domain Adaptive Object detection (SF-FSDA). We also define +two competitive baselines: (1) Faster-FreeShot and (2) MT-FreeShot. Extensive +experiments on the challenging M5-Malaria and Raabin-WBC datasets validate the +effectiveness of MIAdapt. Without using any image from the source domain +MIAdapt surpasses state-of-the-art source-free UDA (SF-UDA) methods by +21.3% +mAP and few-shot domain adaptation (FSDA) approaches by +4.7% mAP on +Raabin-WBC. Our code and models will be publicly available. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Top-K Maximum Intensity Projection Priors for 3D Liver Vessel + Segmentation + + +
+ Liver-vessel segmentation is an essential task in the pre-operative planning +of liver resection. State-of-the-art 2D or 3D convolution-based methods +focus on liver vessel segmentation on 2D CT cross-sectional views, which do +not take into account the global liver-vessel topology. To maintain this global +vessel topology, we rely on the underlying physics used in the CT +reconstruction process, and apply this to liver-vessel segmentation. +Concretely, we introduce the concept of top-k maximum intensity projections, +which mimics the CT reconstruction by replacing the integral along each +projection direction, with keeping the top-k maxima along each projection +direction. We use these top-k maximum projections to condition a diffusion +model and generate 3D liver-vessel trees. We evaluate our 3D liver-vessel +segmentation on the 3D-ircadb-01 dataset, and achieve the highest Dice +coefficient, intersection-over-union (IoU), and Sensitivity scores compared to +prior work. + +
+
+ comment: Accepted in 2025 IEEE International Symposium on Biomedical Imaging + (ISBI 2025) +
+
+
+
+
+ + ☆ TopoMortar: A dataset to evaluate image segmentation methods focused on + topology accuracy + + +
+ We present TopoMortar, a brick wall dataset that is the first dataset +specifically designed to evaluate topology-focused image segmentation methods, +such as topology loss functions. TopoMortar enables to investigate in two ways +whether methods incorporate prior topological knowledge. First, by eliminating +challenges seen in real-world data, such as small training set, noisy labels, +and out-of-distribution test-set images, that, as we show, impact the +effectiveness of topology losses. Second, by allowing to assess in the same +dataset topology accuracy across dataset challenges, isolating dataset-related +effects from the effect of incorporating prior topological knowledge. In these +two experiments, it is deliberately difficult to improve topology accuracy +without actually using topology information, thus, permitting to attribute an +improvement in topology accuracy to the incorporation of prior topological +knowledge. To this end, TopoMortar includes three types of labels (accurate, +noisy, pseudo-labels), two fixed training sets (large and small), and +in-distribution and out-of-distribution test-set images. We compared eight loss +functions on TopoMortar, and we found that clDice achieved the most +topologically accurate segmentations, Skeleton Recall loss performed best +particularly with noisy labels, and the relative advantageousness of the other +loss functions depended on the experimental setting. Additionally, we show that +simple methods, such as data augmentation and self-distillation, can elevate +Cross entropy Dice loss to surpass most topology loss functions, and that those +simple methods can enhance topology loss functions as well. clDice and Skeleton +Recall loss, both skeletonization-based loss functions, were also the fastest +to train, making this type of loss function a promising research direction. +TopoMortar and our code can be found at https://github.com/jmlipman/TopoMortar + +
+
+
+
+
+ + ☆ Video Super-Resolution: All You Need is a Video Diffusion Model + + +
+ We present a generic video super-resolution algorithm in this paper, based on +the Diffusion Posterior Sampling framework with an unconditional video +generation model in latent space. The video generation model, a diffusion +transformer, functions as a space-time model. We argue that a powerful model, +which learns the physics of the real world, can easily handle various kinds of +motion patterns as prior knowledge, thus eliminating the need for explicit +estimation of optical flows or motion parameters for pixel alignment. +Furthermore, a single instance of the proposed video diffusion transformer +model can adapt to different sampling conditions without re-training. Due to +limited computational resources and training data, our experiments provide +empirical evidence of the algorithm's strong super-resolution capabilities +using synthetic data. + +
+
+
+
+
+ + ☆ Automated Attendee Recognition System for Large-Scale Social Events or + Conference Gathering + + +
+ Manual attendance tracking at large-scale events, such as marriage functions +or conferences, is often inefficient and prone to human error. To address this +challenge, we propose an automated, cloud-based attendance tracking system that +uses cameras mounted at the entrance and exit gates. The mounted cameras +continuously capture video and send the video data to cloud services to perform +real-time face detection and recognition. Unlike existing solutions, our system +accurately identifies attendees even when they are not looking directly at the +camera, allowing natural movements, such as looking around or talking while +walking. To the best of our knowledge, this is the first system to achieve high +recognition rates under such dynamic conditions. Our system demonstrates +overall 90% accuracy, with each video frame processed in 5 seconds, ensuring +real-time operation without frame loss. In addition, notifications are sent +promptly to security personnel within the same latency. This system achieves +100% accuracy for individuals without facial obstructions and successfully +recognizes all attendees appearing within the camera's field of view, providing +a robust solution for attendee recognition in large-scale social events. + +
+
+
+
+
+ + ☆ Deep Learning-Based Diffusion MRI Tractography: Integrating Spatial and + Anatomical Information + + +
+ Diffusion MRI tractography technique enables non-invasive visualization of +the white matter pathways in the brain. It plays a crucial role in neuroscience +and clinical fields by facilitating the study of brain connectivity and +neurological disorders. However, the accuracy of reconstructed tractograms has +been a longstanding challenge. Recently, deep learning methods have been +applied to improve tractograms for better white matter coverage, but this often +comes at the expense of generating excessive false-positive connections. This +is largely due to their reliance on local information to predict long range +streamlines. To improve the accuracy of streamline propagation predictions, we +introduce a novel deep learning framework that integrates image-domain spatial +information and anatomical information along tracts, with the former extracted +through convolutional layers and the latter modeled via a Transformer-decoder. +Additionally, we employ a weighted loss function to address fiber class +imbalance encountered during training. We evaluate the proposed method on the +simulated ISMRM 2015 Tractography Challenge dataset, achieving a valid +streamline rate of 66.2%, white matter coverage of 63.8%, and successfully +reconstructing 24 out of 25 bundles. Furthermore, on the multi-site +Tractoinferno dataset, the proposed method demonstrates its ability to handle +various diffusion MRI acquisition schemes, achieving a 5.7% increase in white +matter coverage and a 4.1% decrease in overreach compared to RNN-based methods. + +
+
+
+
+
+ + ☆ ScaleFusionNet: Transformer-Guided Multi-Scale Feature Fusion for Skin + Lesion Segmentation + + +
+ Melanoma is a malignant tumor originating from skin cell lesions. Accurate +and efficient segmentation of skin lesions is essential for quantitative +medical analysis but remains challenging. To address this, we propose +ScaleFusionNet, a segmentation model that integrates Cross-Attention +Transformer Module (CATM) and AdaptiveFusionBlock to enhance feature extraction +and fusion. The model employs a hybrid architecture encoder that effectively +captures both local and global features. We introduce CATM, which utilizes Swin +Transformer Blocks and Cross Attention Fusion (CAF) to adaptively refine +encoder-decoder feature fusion, reducing semantic gaps and improving +segmentation accuracy. Additionally, the AdaptiveFusionBlock is improved by +integrating adaptive multi-scale fusion, where Swin Transformer-based attention +complements deformable convolution-based multi-scale feature extraction. This +enhancement refines lesion boundaries and preserves fine-grained details. +ScaleFusionNet achieves Dice scores of 92.94% and 91.65% on ISIC-2016 and +ISIC-2018 datasets, respectively, demonstrating its effectiveness in skin +lesion analysis. Our code implementation is publicly available at GitHub. + +
+
+
+
+
+ + ☆ Golden Cudgel Network for Real-Time Semantic Segmentation + + +
+ Recent real-time semantic segmentation models, whether single-branch or +multi-branch, achieve good performance and speed. However, their speed is +limited by multi-path blocks, and some depend on high-performance teacher +models for training. To overcome these issues, we propose Golden Cudgel Network +(GCNet). Specifically, GCNet uses vertical multi-convolutions and horizontal +multi-paths for training, which are reparameterized into a single convolution +for inference, optimizing both performance and speed. This design allows GCNet +to self-enlarge during training and self-contract during inference, effectively +becoming a "teacher model" without needing external ones. Experimental results +show that GCNet outperforms existing state-of-the-art models in terms of +performance and speed on the Cityscapes, CamVid, and Pascal VOC 2012 datasets. +The code is available at https://github.com/gyyang23/GCNet. + +
+
+
+
+
+ + ☆ See What You Are Told: Visual Attention Sink in Large Multimodal Models + + +
+ Large multimodal models (LMMs) "see" images by leveraging the attention +mechanism between text and visual tokens in the transformer decoder. Ideally, +these models should focus on key visual information relevant to the text token. +However, recent findings indicate that LMMs have an extraordinary tendency to +consistently allocate high attention weights to specific visual tokens, even +when these tokens are irrelevant to the corresponding text. In this study, we +investigate the property behind the appearance of these irrelevant visual +tokens and examine their characteristics. Our findings show that this behavior +arises due to the massive activation of certain hidden state dimensions, which +resembles the attention sink found in language models. Hence, we refer to this +phenomenon as the visual attention sink. In particular, our analysis reveals +that removing the irrelevant visual sink tokens does not impact model +performance, despite receiving high attention weights. Consequently, we recycle +the attention to these tokens as surplus resources, redistributing the +attention budget to enhance focus on the image. To achieve this, we introduce +Visual Attention Redistribution (VAR), a method that redistributes attention in +image-centric heads, which we identify as innately focusing on visual +information. VAR can be seamlessly applied across different LMMs to improve +performance on a wide range of tasks, including general vision-language tasks, +visual hallucination tasks, and vision-centric tasks, all without the need for +additional training, models, or inference steps. Experimental results +demonstrate that VAR enables LMMs to process visual information more +effectively by adjusting their internal attention mechanisms, offering a new +direction to enhancing the multimodal capabilities of LMMs. + +
+
+
+
+
+ + ☆ Full-DoF Egomotion Estimation for Event Cameras Using Geometric Solvers CVPR + + +
+ For event cameras, current sparse geometric solvers for egomotion estimation +assume that the rotational displacements are known, such as those provided by +an IMU. Thus, they can only recover the translational motion parameters. +Recovering full-DoF motion parameters using a sparse geometric solver is a more +challenging task, and has not yet been investigated. In this paper, we propose +several solvers to estimate both rotational and translational velocities within +a unified framework. Our method leverages event manifolds induced by line +segments. The problem formulations are based on either an incidence relation +for lines or a novel coplanarity relation for normal vectors. We demonstrate +the possibility of recovering full-DoF egomotion parameters for both angular +and linear velocities without requiring extra sensor measurements or motion +priors. To achieve efficient optimization, we exploit the Adam framework with a +first-order approximation of rotations for quick initialization. Experiments on +both synthetic and real-world data demonstrate the effectiveness of our method. +The code is available at https://github.com/jizhaox/relpose-event. + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR), 2025 +
+
+
+
+
+ + ☆ Label-Efficient LiDAR Semantic Segmentation with 2D-3D Vision + Transformer Adapters + + +
+ LiDAR semantic segmentation models are typically trained from random +initialization as universal pre-training is hindered by the lack of large, +diverse datasets. Moreover, most point cloud segmentation architectures +incorporate custom network layers, limiting the transferability of advances +from vision-based architectures. Inspired by recent advances in universal +foundation models, we propose BALViT, a novel approach that leverages frozen +vision models as amodal feature encoders for learning strong LiDAR encoders. +Specifically, BALViT incorporates both range-view and bird's-eye-view LiDAR +encoding mechanisms, which we combine through a novel 2D-3D adapter. While the +range-view features are processed through a frozen image backbone, our +bird's-eye-view branch enhances them through multiple cross-attention +interactions. Thereby, we continuously improve the vision network with +domain-dependent knowledge, resulting in a strong label-efficient LiDAR +encoding mechanism. Extensive evaluations of BALViT on the SemanticKITTI and +nuScenes benchmarks demonstrate that it outperforms state-of-the-art methods on +small data regimes. We make the code and models publicly available at: +http://balvit.cs.uni-freiburg.de. + +
+
+
+
+
+ + ☆ Interactive Segmentation and Report Generation for CT Images + + +
+ Automated CT report generation plays a crucial role in improving diagnostic +accuracy and clinical workflow efficiency. However, existing methods lack +interpretability and impede patient-clinician understanding, while their static +nature restricts radiologists from dynamically adjusting assessments during +image review. Inspired by interactive segmentation techniques, we propose a +novel interactive framework for 3D lesion morphology reporting that seamlessly +generates segmentation masks with comprehensive attribute descriptions, +enabling clinicians to generate detailed lesion profiles for enhanced +diagnostic assessment. To our best knowledge, we are the first to integrate the +interactive segmentation and structured reports in 3D CT medical images. +Experimental results across 15 lesion types demonstrate the effectiveness of +our approach in providing a more comprehensive and reliable reporting system +for lesion segmentation and capturing. The source code will be made publicly +available following paper acceptance. + +
+
+
+
+
+ + ☆ Deep Understanding of Sign Language for Sign to Subtitle Alignment + + +
+ The objective of this work is to align asynchronous subtitles in sign +language videos with limited labelled data. To achieve this goal, we propose a +novel framework with the following contributions: (1) we leverage fundamental +grammatical rules of British Sign Language (BSL) to pre-process the input +subtitles, (2) we design a selective alignment loss to optimise the model for +predicting the temporal location of signs only when the queried sign actually +occurs in a scene, and (3) we conduct self-training with refined pseudo-labels +which are more accurate than the heuristic audio-aligned labels. From this, our +model not only better understands the correlation between the text and the +signs, but also holds potential for application in the translation of sign +languages, particularly in scenarios where manual labelling of large-scale sign +data is impractical or challenging. Extensive experimental results demonstrate +that our approach achieves state-of-the-art results, surpassing previous +baselines by substantial margins in terms of both frame-level accuracy and +F1-score. This highlights the effectiveness and practicality of our framework +in advancing the field of sign language video alignment and translation. + +
+
+
+
+
+ + ☆ Enhancing Visual Forced Alignment with Local Context-Aware Feature + Extraction and Multi-Task Learning ICASSP2025 + + +
+ This paper introduces a novel approach to Visual Forced Alignment (VFA), +aiming to accurately synchronize utterances with corresponding lip movements, +without relying on audio cues. We propose a novel VFA approach that integrates +a local context-aware feature extractor and employs multi-task learning to +refine both global and local context features, enhancing sensitivity to subtle +lip movements for precise word-level and phoneme-level alignment. Incorporating +the improved Viterbi algorithm for post-processing, our method significantly +reduces misalignments. Experimental results show our approach outperforms +existing methods, achieving a 6% accuracy improvement at the word-level and 27% +improvement at the phoneme-level in LRS2 dataset. These improvements offer new +potential for applications in automatically subtitling TV shows or +user-generated content platforms like TikTok and YouTube Shorts. + +
+
+ comment: Accepted by ICASSP2025 +
+
+
+
+
+ + ☆ Enhancing Vietnamese VQA through Curriculum Learning on Raw and + Augmented Text Representations AAAI-25 + + +
+ Visual Question Answering (VQA) is a multimodal task requiring reasoning +across textual and visual inputs, which becomes particularly challenging in +low-resource languages like Vietnamese due to linguistic variability and the +lack of high-quality datasets. Traditional methods often rely heavily on +extensive annotated datasets, computationally expensive pipelines, and large +pre-trained models, specifically in the domain of Vietnamese VQA, limiting +their applicability in such scenarios. To address these limitations, we propose +a training framework that combines a paraphrase-based feature augmentation +module with a dynamic curriculum learning strategy. Explicitly, augmented +samples are considered "easy" while raw samples are regarded as "hard". The +framework then utilizes a mechanism that dynamically adjusts the ratio of easy +to hard samples during training, progressively modifying the same dataset to +increase its difficulty level. By enabling gradual adaptation to task +complexity, this approach helps the Vietnamese VQA model generalize well, thus +improving overall performance. Experimental results show consistent +improvements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset, +highlighting both the potential and challenges of our approach in advancing VQA +for Vietnamese language. + +
+
+ comment: 10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and + Intelligence +
+
+
+
+
+ + ☆ Gaussian highpass guided image filtering + + +
+ Guided image filtering (GIF) is a popular smoothing technique, in which an +additional image is used as a structure guidance for noise removal with edge +preservation. The original GIF and some of its subsequent improvements are +derived from a two-parameter local affine model (LAM), where the filtering +output is a local affine transformation of the guidance image, but the input +image is not taken into account in the LAM formulation. In this paper, we first +introduce a single-parameter Prior Model based on Gaussian (highpass/lowpass) +Filtering (PM-GF), in which the filtering output is the sum of a weighted +portion of Gaussian highpass filtering of the guidance image and Gaussian +smoothing of the input image. In the PM-GF, the guidance structure determined +by Gaussian highpass filtering is obviously transferred to the filtering +output, thereby better revealing the structure transfer mechanism of guided +filtering. Then we propose several Gaussian highpass GIFs (GH-GIFs) based on +the PM-GF by emulating the original GIF and some improvements, i.e., using +PM-GF instead of LAM in these GIFs. Experimental results illustrate that the +proposed GIFs outperform their counterparts in several image processing +applications. + +
+
+
+
+
+ + ☆ BEVMOSNet: Multimodal Fusion for BEV Moving Object Segmentation + + +
+ Accurate motion understanding of the dynamic objects within the scene in +bird's-eye-view (BEV) is critical to ensure a reliable obstacle avoidance +system and smooth path planning for autonomous vehicles. However, this task has +received relatively limited exploration when compared to object detection and +segmentation with only a few recent vision-based approaches presenting +preliminary findings that significantly deteriorate in low-light, nighttime, +and adverse weather conditions such as rain. Conversely, LiDAR and radar +sensors remain almost unaffected in these scenarios, and radar provides key +velocity information of the objects. Therefore, we introduce BEVMOSNet, to our +knowledge, the first end-to-end multimodal fusion leveraging cameras, LiDAR, +and radar to precisely predict the moving objects in BEV. In addition, we +perform a deeper analysis to find out the optimal strategy for deformable +cross-attention-guided sensor fusion for cross-sensor knowledge sharing in BEV. +While evaluating BEVMOSNet on the nuScenes dataset, we show an overall +improvement in IoU score of 36.59% compared to the vision-based unimodal +baseline BEV-MoSeg (Sigatapu et al., 2023), and 2.35% compared to the +multimodal SimpleBEV (Harley et al., 2022), extended for the motion +segmentation task, establishing this method as the state-of-the-art in BEV +motion segmentation. + +
+
+ comment: In Proceedings of the 20th International Joint Conference on Computer + Vision, Imaging and Computer Graphics Theory and Applications (2025) +
+
+
+
+
+ + ☆ Enhancing Abnormality Grounding for Vision Language Models with + Knowledge Descriptions + + +
+ Visual Language Models (VLMs) have demonstrated impressive capabilities in +visual grounding tasks. However, their effectiveness in the medical domain, +particularly for abnormality detection and localization within medical images, +remains underexplored. A major challenge is the complex and abstract nature of +medical terminology, which makes it difficult to directly associate +pathological anomaly terms with their corresponding visual features. In this +work, we introduce a novel approach to enhance VLM performance in medical +abnormality detection and localization by leveraging decomposed medical +knowledge. Instead of directly prompting models to recognize specific +abnormalities, we focus on breaking down medical concepts into fundamental +attributes and common visual patterns. This strategy promotes a stronger +alignment between textual descriptions and visual features, improving both the +recognition and localization of abnormalities in medical images. We evaluate our +method on the 0.23B Florence-2 base model and demonstrate that it achieves +comparable performance in abnormality grounding to significantly larger 7B +LLaVA-based medical VLMs, despite being trained on only 1.5% of the data used +for such models. Experimental results also demonstrate the effectiveness of our +approach in both known and previously unseen abnormalities, suggesting its +strong generalization capabilities. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Towards Effective and Sparse Adversarial Attack on Spiking Neural + Networks via Breaking Invisible Surrogate Gradients CVPR 2025 + + +
+ Spiking neural networks (SNNs) have shown their competence in handling +spatial-temporal event-based data with low energy consumption. Similar to +conventional artificial neural networks (ANNs), SNNs are also vulnerable to +gradient-based adversarial attacks, wherein gradients are calculated by +spatial-temporal back-propagation (STBP) and surrogate gradients (SGs). +However, the SGs may be invisible for an inference-only model as they do not +influence the inference results, and current gradient-based attacks are +ineffective for binary dynamic images captured by the dynamic vision sensor +(DVS). While some approaches addressed the issue of invisible SGs through +universal SGs, their SGs lack a correlation with the victim model, resulting in +sub-optimal performance. Moreover, the imperceptibility of existing SNN-based +binary attacks is still insufficient. In this paper, we introduce an innovative +potential-dependent surrogate gradient (PDSG) method to establish a robust +connection between the SG and the model, thereby enhancing the adaptability of +adversarial attacks across various models with invisible SGs. Additionally, we +propose the sparse dynamic attack (SDA) to effectively attack binary dynamic +images. Utilizing a generation-reduction paradigm, SDA can fully optimize the +sparsity of adversarial perturbations. Experimental results demonstrate that +our PDSG and SDA outperform state-of-the-art SNN-based attacks across various +models and datasets. Specifically, our PDSG achieves 100% attack success rate +on ImageNet, and our SDA obtains 82% attack success rate by modifying only +0.24% of the pixels on CIFAR10DVS. The code is available at +https://github.com/ryime/PDSG-SDA . + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ☆ Reduced Spatial Dependency for More General Video-level Deepfake + Detection ICASSP 2025 + + +
+ As one of the prominent AI-generated content, Deepfake has raised significant +safety concerns. Although it has been demonstrated that temporal consistency +cues offer better generalization capability, existing methods based on CNNs +inevitably introduce spatial bias, which hinders the extraction of intrinsic +temporal features. To address this issue, we propose a novel method called +Spatial Dependency Reduction (SDR), which integrates common temporal +consistency features from multiple spatially-perturbed clusters, to reduce the +dependency of the model on spatial information. Specifically, we design +multiple Spatial Perturbation Branch (SPB) to construct spatially-perturbed +feature clusters. Subsequently, we utilize the theory of mutual information and +propose a Task-Relevant Feature Integration (TRFI) module to capture temporal +features residing in similar latent space from these clusters. Finally, the +integrated feature is fed into a temporal transformer to capture long-range +dependencies. Extensive benchmarks and ablation studies demonstrate the +effectiveness and rationale of our approach. + +
+
+ comment: 5 pages, 2 figures. Accepted to ICASSP 2025 +
+
+
+
+
+ + ☆ Optimizing for the Shortest Path in Denoising Diffusion Model CVPR 2025 + + +
+ In this research, we propose a novel denoising diffusion model based on +shortest-path modeling that optimizes residual propagation to enhance both +denoising efficiency and quality. Drawing on Denoising Diffusion Implicit Models +(DDIM) and insights from graph theory, our model, termed the Shortest Path +Diffusion Model (ShortDF), treats the denoising process as a shortest-path +problem aimed at minimizing reconstruction error. By optimizing the initial +residuals, we improve the efficiency of the reverse diffusion process and the +quality of the generated samples. Extensive experiments on multiple standard +benchmarks demonstrate that ShortDF significantly reduces diffusion time (or +steps) while enhancing the visual fidelity of generated samples compared to +prior arts. This work, we suppose, paves the way for interactive diffusion-based +applications and establishes a foundation for rapid data generation. Code is +available at https://github.com/UnicomAI/ShortDF. + +
+
+ comment: Accepted by CVPR 2025 (10 pages, 6 figures) +
+
+
+
+
+ + ☆ Trajectory Prediction for Autonomous Driving: Progress, Limitations, and + Future Directions + + +
+ As the potential for autonomous vehicles to be integrated on a large scale +into modern traffic systems continues to grow, ensuring safe navigation in +dynamic environments is crucial for smooth integration. To guarantee safety and +prevent collisions, autonomous vehicles must be capable of accurately +predicting the trajectories of surrounding traffic agents. Over the past +decade, significant efforts from both academia and industry have been dedicated +to designing solutions for precise trajectory forecasting. These efforts have +produced a diverse range of approaches, raising questions about the differences +between these methods and whether trajectory prediction challenges have been +fully addressed. This paper reviews a substantial portion of recent trajectory +prediction methods and devises a taxonomy to classify existing solutions. A +general overview of the prediction pipeline is also provided, covering input +and output modalities, modeling features, and prediction paradigms discussed in +the literature. In addition, the paper discusses active research areas within +trajectory prediction, addresses the posed research questions, and highlights +the remaining research gaps and challenges. + +
+
+
+
+
+ + ☆ BANet: Bilateral Aggregation Network for Mobile Stereo Matching + + +
+ State-of-the-art stereo matching methods typically use costly 3D convolutions +to aggregate a full cost volume, but their computational demands make mobile +deployment challenging. Directly applying 2D convolutions for cost aggregation +often results in edge blurring, detail loss, and mismatches in textureless +regions. Some complex operations, like deformable convolutions and iterative +warping, can partially alleviate this issue; however, they are not +mobile-friendly, limiting their deployment on mobile devices. In this paper, we +present a novel bilateral aggregation network (BANet) for mobile stereo +matching that produces high-quality results with sharp edges and fine details +using only 2D convolutions. Specifically, we first separate the full cost +volume into detailed and smooth volumes using a spatial attention map, then +perform detailed and smooth aggregations accordingly, ultimately fusing both to +obtain the final disparity map. Additionally, to accurately identify +high-frequency detailed regions and low-frequency smooth/textureless regions, +we propose a new scale-aware spatial attention module. Experimental results +demonstrate that our BANet-2D significantly outperforms other mobile-friendly +methods, achieving 35.3% higher accuracy on the KITTI 2015 leaderboard than +MobileStereoNet-2D, with faster runtime on mobile devices. The extended 3D +version, BANet-3D, achieves the highest accuracy among all real-time methods on +high-end GPUs. Code: https://github.com/gangweiX/BANet. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ BAT: Learning Event-based Optical Flow with Bidirectional Adaptive + Temporal Correlation + + +
+ Event cameras deliver visual information characterized by a high dynamic +range and high temporal resolution, offering significant advantages in +estimating optical flow for complex lighting conditions and fast-moving +objects. Current advanced optical flow methods for event cameras largely adopt +established image-based frameworks. However, the spatial sparsity of event data +limits their performance. In this paper, we present BAT, an innovative +framework that estimates event-based optical flow using bidirectional adaptive +temporal correlation. BAT includes three novel designs: 1) a bidirectional +temporal correlation that transforms bidirectional temporally dense motion cues +into spatially dense ones, enabling accurate and spatially dense optical flow +estimation; 2) an adaptive temporal sampling strategy for maintaining temporal +consistency in correlation; 3) spatially adaptive temporal motion aggregation +to efficiently and adaptively aggregate consistent target motion features into +adjacent motion features while suppressing inconsistent ones. Our results rank +1st on the DSEC-Flow benchmark, outperforming existing state-of-the-art +methods by a large margin while also exhibiting sharp edges and high-quality +details. Notably, our BAT can accurately predict future optical flow using only +past events, significantly outperforming E-RAFT's warm-start approach. Code: +https://github.com/gangweiX/BAT. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Computational Analysis of Degradation Modeling in Blind Panoramic Image + Quality Assessment + + +
+ Blind panoramic image quality assessment (BPIQA) has recently brought new +challenges to the visual quality community, due to the complex interaction +between immersive content and human behavior. Although many efforts have been +made to advance BPIQA from both conducting psychophysical experiments and +designing performance-driven objective algorithms, limited content and +few samples in those closed sets inevitably would result in shaky +conclusions, thereby hindering the development of BPIQA; we refer to this as the +"easy-database" issue. In this paper, we present a sufficient +computational analysis of degradation modeling in BPIQA to thoroughly explore +the "easy-database" issue, where we carefully design three types of +experiments via investigating the gap between BPIQA and blind image quality +assessment (BIQA), the necessity of specific design in BPIQA models, and the +generalization ability of BPIQA models. From extensive experiments, we find +that easy databases narrow the gap between the performance of BPIQA and BIQA +models, which is unconducive to the development of BPIQA. Moreover, the easy +databases bring the BPIQA models close to saturation, so the +effectiveness of the associated specific designs cannot be well verified. +Besides, the BPIQA models trained on our recently proposed databases with +complicated degradation show better generalization ability. Thus, we believe +that much more effort needs to be put into BPIQA from both +subjective and objective viewpoints. + +
+
+
+
+
+ + ☆ Two-Stream Thermal Imaging Fusion for Enhanced Time of Birth Detection + in Neonatal Care + + +
+ Around 10% of newborns require some help to initiate breathing, and 5% need +ventilation assistance. Accurate Time of Birth (ToB) documentation is essential +for optimizing neonatal care, as timely interventions are vital for proper +resuscitation. However, current clinical methods for recording ToB often rely +on manual processes, which can be prone to inaccuracies. In this study, we +present a novel two-stream fusion system that combines the power of image and +video analysis to accurately detect the ToB from thermal recordings in the +delivery room and operating theater. By integrating static and dynamic streams, +our approach captures richer birth-related spatiotemporal features, leading to +more robust and precise ToB estimation. We demonstrate that this synergy +between data modalities enhances performance over single-stream approaches. Our +system achieves 95.7% precision and 84.8% recall in detecting birth within +short video clips. Additionally, with the help of a score aggregation module, +it successfully identifies ToB in 100% of test cases, with a median absolute +error of 2 seconds and an absolute mean deviation of 4.5 seconds compared to +manual annotations. + +
+
+ comment: Submitted to IEEE 25th International Conference on Digital Signal + Processing +
+
+
+
+
+ + ☆ GenColor: Generative Color-Concept Association in Visual Design + + +
+ Existing approaches for color-concept association typically rely on +query-based image referencing, and color extraction from image references. +However, these approaches are effective only for common concepts, and are +vulnerable to unstable image referencing and varying image conditions. Our +formative study with designers underscores the need for primary-accent color +compositions and context-dependent colors (e.g., 'clear' vs. 'polluted' sky) in +design. In response, we introduce a generative approach for mining semantically +resonant colors leveraging images generated by text-to-image models. Our +insight is that contemporary text-to-image models can resemble visual patterns +from large-scale real-world data. The framework comprises three stages: concept +instancing produces generative samples using diffusion models, text-guided +image segmentation identifies concept-relevant regions within the image, and +color association extracts primary colors accompanied by accent colors. Quantitative +comparisons with expert designs validate our approach's effectiveness, and we +demonstrate the applicability through cases in various design scenarios and a +gallery. + +
+
+ comment: 19 pages, 16 figures. Accepted at CHI Conference on Human Factors in + Computing Systems (CHI'25), April 26-May 1, 2025, Yokohama, Japan +
+
+
+
+
+ + ☆ Path-Adaptive Matting for Efficient Inference Under Various + Computational Cost Constraints AAAI 2025 + + +
+ In this paper, we explore a novel image matting task aimed at achieving +efficient inference under various computational cost constraints, specifically +FLOP limitations, using a single matting network. Existing matting methods +which have not explored scalable architectures or path-learning strategies, +fail to tackle this challenge. To overcome these limitations, we introduce +Path-Adaptive Matting (PAM), a framework that dynamically adjusts network paths +based on image contexts and computational cost constraints. We formulate the +training of the computational cost-constrained matting network as a bilevel +optimization problem, jointly optimizing the matting network and the path +estimator. Building on this formalization, we design a path-adaptive matting +architecture by incorporating path selection layers and learnable connect +layers to estimate optimal paths and perform efficient inference within a +unified network. Furthermore, we propose a performance-aware path-learning +strategy to generate path labels online by evaluating a few paths sampled from +the prior distribution of optimal paths and network estimations, enabling +robust and efficient online path learning. Experiments on five image matting +datasets demonstrate that the proposed PAM framework achieves competitive +performance across a range of computational cost constraints. + +
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ☆ Mocap-2-to-3: Lifting 2D Diffusion-Based Pretrained Models for 3D Motion + Capture + + +
+ Recovering absolute poses in the world coordinate system from monocular views +presents significant challenges. Two primary issues arise in this context. +Firstly, existing methods rely on 3D motion data for training, which requires +collection in limited environments. Acquiring such 3D labels for new actions in +a timely manner is impractical, severely restricting the model's generalization +capabilities. In contrast, 2D poses are far more accessible and easier to +obtain. Secondly, estimating a person's absolute position in metric space from +a single viewpoint is inherently more complex. To address these challenges, we +introduce Mocap-2-to-3, a novel framework that decomposes intricate 3D motions +into 2D poses, leveraging 2D data to enhance 3D motion reconstruction in +diverse scenarios and accurately predict absolute positions in the world +coordinate system. We initially pretrain a single-view diffusion model with +extensive 2D data, followed by fine-tuning a multi-view diffusion model for +view consistency using publicly available 3D data. This strategy facilitates +the effective use of large-scale 2D data. Additionally, we propose an +innovative human motion representation that decouples local actions from global +movements and encodes geometric priors of the ground, ensuring the generative +model learns accurate motion priors from 2D data. During inference, this allows +for the gradual recovery of global movements, resulting in more plausible +positioning. We evaluate our model's performance on real-world datasets, +demonstrating superior accuracy in motion and absolute human positioning +compared to state-of-the-art methods, along with enhanced generalization and +scalability. Our code will be made publicly available. + +
+
+
+
+
+ + ☆ Rice Grain Size Measurement using Image Processing + + +
+ The rice grain quality can be determined from its size and chalkiness. The +traditional approach to measure the rice grain size involves manual inspection, +which is inefficient and leads to inconsistent results. To address this issue, +an image processing based approach is proposed and developed in this research. +The approach takes image of rice grains as input and outputs the number of rice +grains and size of each rice grain. The different steps, such as extraction of +region of interest, segmentation of rice grains, and sub-contours removal, +involved in the proposed approach are discussed. The approach was tested on +rice grain images captured from different height using mobile phone camera. The +obtained results show that the proposed approach successfully detected 95% of +the rice grains and achieved 90% accuracy for length and width measurement. + +
+
+
+
+
+ + ☆ An Analytical Theory of Power Law Spectral Bias in the Learning Dynamics + of Diffusion Models + + +
+ We developed an analytical framework for understanding how the learned +distribution evolves during diffusion model training. Leveraging the Gaussian +equivalence principle, we derived exact solutions for the gradient-flow +dynamics of weights in one- or two-layer linear denoiser settings with +arbitrary data. Remarkably, these solutions allowed us to derive the generated +distribution in closed form and its KL divergence through training. These +analytical results expose a pronounced power-law spectral bias, i.e., for +weights and distributions, the convergence time of a mode follows an inverse +power law of its variance. Empirical experiments on both Gaussian and image +datasets demonstrate that the power-law spectral bias remains robust even when +using deeper or convolutional architectures. Our results underscore the +importance of the data covariance in dictating the order and rate at which +diffusion models learn different modes of the data, providing potential +explanations for why earlier stopping could lead to incorrect details in image +generative models. + +
+
+ comment: 50 pages, 10 figures. Preprint +
+
+
+
+
+ + ☆ Find Matching Faces Based On Face Parameters + + +
+ This paper presents an innovative approach that enables the user to find +matching faces based on the user-selected face parameters. Through gradio-based +user interface, the users can interactively select the face parameters they +want in their desired partner. These user-selected face parameters are +transformed into a text prompt which is used by the Text-To-Image generation +model to generate a realistic face image. Further, the generated image along +with the images downloaded from the Jeevansathi.com are processed through face +detection and feature extraction model, which results in high dimensional +vector embedding of 512 dimensions. The vector embeddings generated from the +downloaded images are stored into vector database. Now, the similarity search +is carried out between the vector embedding of generated image and the stored +vector embeddings. As a result, it displays the top five similar faces based on +the user-selected face parameters. This contribution holds a significant +potential to turn into a high-quality personalized face matching tool. + +
+
+
+
+
+ + ☆ Variance-Aware Loss Scheduling for Multimodal Alignment in Low-Data + Settings + + +
+ Training vision-language models for image-text alignment typically requires +large datasets to achieve robust performance. In low-data scenarios, standard +contrastive learning can struggle to align modalities effectively due to +overfitting and unstable training dynamics. In this paper, we propose a +variance-aware loss scheduling approach that dynamically adjusts the weighting +of the contrastive loss based on the statistical variability (uncertainty) in +the model's alignment predictions. Using a subset of the Flickr8k image-caption +dataset to simulate limited data conditions, we demonstrate that our approach +improves image-text retrieval accuracy compared to a fixed-weight baseline. We +also compare against other adaptive weighting strategies (using output entropy +and cosine similarity spread) and find that variance-aware scheduling provides +the best overall trade-off. Qualitatively, our method yields more distinct +multimodal embeddings as shown by t-SNE visualizations. Moreover, in a stress +test with noise-injected captions and images, the variance-guided loss proves +more robust, maintaining higher recall when random perturbations are +introduced. These results highlight the benefit of adaptive loss weighting for +multimodal alignment in low-data regimes. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Transformer-Based Spatio-Temporal Association of Apple Fruitlets + + +
+ In this paper, we present a transformer-based method to spatio-temporally +associate apple fruitlets in stereo-images collected on different days and from +different camera poses. State-of-the-art association methods in agriculture are +dedicated towards matching larger crops using either high-resolution point +clouds or temporally stable features, which are both difficult to obtain for +smaller fruit in the field. To address these challenges, we propose a +transformer-based architecture that encodes the shape and position of each +fruitlet, and propagates and refines these features through a series of +transformer encoder layers with alternating self and cross-attention. We +demonstrate that our method is able to achieve an F1-score of 92.4% on data +collected in a commercial apple orchard and outperforms all baselines and +ablations. + +
+
+
+
+
+ + ☆ SpiritSight Agent: Advanced GUI Agent with One Look CVPR 2025 + + +
+ Graphical User Interface (GUI) agents show amazing abilities in assisting +human-computer interaction, automating human user's navigation on digital +devices. An ideal GUI agent is expected to achieve high accuracy, low latency, +and compatibility for different GUI platforms. Recent vision-based approaches +have shown promise by leveraging advanced Vision Language Models (VLMs). While +they generally meet the requirements of compatibility and low latency, these +vision-based GUI agents tend to have low accuracy due to their limitations in +element grounding. To address this issue, we propose $\textbf{SpiritSight}$, a +vision-based, end-to-end GUI agent that excels in GUI navigation tasks across +various GUI platforms. First, we create a multi-level, large-scale, +high-quality GUI dataset called $\textbf{GUI-Lasagne}$ using scalable methods, +empowering SpiritSight with robust GUI understanding and grounding +capabilities. Second, we introduce the $\textbf{Universal Block Parsing (UBP)}$ +method to resolve the ambiguity problem in dynamic high-resolution of visual +inputs, further enhancing SpiritSight's ability to ground GUI objects. Through +these efforts, SpiritSight agent outperforms other advanced methods on diverse +GUI benchmarks, demonstrating its superior capability and compatibility in GUI +navigation tasks. Models are available at +$\href{https://huggingface.co/SenseLLM/SpiritSight-Agent-8B}{this\ URL}$. + +
+
+ comment: Paper accepted to CVPR 2025 +
+
+
+
+
+ + ☆ DSPNet: Dual-vision Scene Perception for Robust 3D Question Answering + + +
+ 3D Question Answering (3D QA) requires the model to comprehensively +understand its situated 3D scene described by the text, then reason about its +surrounding environment and answer a question under that situation. However, +existing methods usually rely on global scene perception from pure 3D point +clouds and overlook the importance of rich local texture details from +multi-view images. Moreover, due to the inherent noise in camera poses and +complex occlusions, there exists significant feature degradation and reduced +feature robustness problems when aligning 3D point cloud with multi-view +images. In this paper, we propose a Dual-vision Scene Perception Network +(DSPNet), to comprehensively integrate multi-view and point cloud features to +improve robustness in 3D QA. Our Text-guided Multi-view Fusion (TGMF) module +prioritizes image views that closely match the semantic content of the text. To +adaptively fuse back-projected multi-view images with point cloud features, we +design the Adaptive Dual-vision Perception (ADVP) module, enhancing 3D scene +comprehension. Additionally, our Multimodal Context-guided Reasoning (MCGR) +module facilitates robust reasoning by integrating contextual information +across visual and linguistic modalities. Experimental results on SQA3D and +ScanQA datasets demonstrate the superiority of our DSPNet. Codes will be +available at https://github.com/LZ-CH/DSPNet. + +
+
+
+
+
+ + ☆ Partial Convolution Meets Visual Attention + + +
+ Designing an efficient and effective neural network has remained a prominent +topic in computer vision research. Depthwise convolution (DWConv) is widely used +in efficient CNNs or ViTs, but it needs frequent memory access during +inference, which leads to low throughput. FasterNet attempts to introduce +partial convolution (PConv) as an alternative to DWConv but compromises the +accuracy due to underutilized channels. To remedy this shortcoming and consider +the redundancy between feature map channels, we introduce a novel Partial +visual ATtention mechanism (PAT) that can efficiently combine PConv with visual +attention. Our exploration indicates that the partial attention mechanism can +completely replace the full attention mechanism and reduce model parameters and +FLOPs. Our PAT can derive three types of blocks: Partial Channel-Attention +block (PAT_ch), Partial Spatial-Attention block (PAT_sp) and Partial +Self-Attention block (PAT_sf). First, PAT_ch integrates the enhanced Gaussian +channel attention mechanism to infuse global distribution information into the +untouched channels of PConv. Second, we introduce the spatial-wise attention to +the MLP layer to further improve model accuracy. Finally, we replace PAT_ch in +the last stage with the self-attention mechanism to extend the global receptive +field. Building upon PAT, we propose a novel hybrid network family, named +PATNet, which achieves superior top-1 accuracy and inference speed compared to +FasterNet on ImageNet-1K classification and excels in both detection and +segmentation on the COCO dataset. Particularly, our PATNet-T2 achieves 1.3% +higher accuracy than FasterNet-T2, while exhibiting 25% higher GPU throughput +and 24% lower CPU latency. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2502.01303 +
+
+
+
+
+ + ☆ Temporal Separation with Entropy Regularization for Knowledge + Distillation in Spiking Neural Networks CVPR 2025 + + +
+ Spiking Neural Networks (SNNs), inspired by the human brain, offer +significant computational efficiency through discrete spike-based information +transfer. Despite their potential to reduce inference energy consumption, a +performance gap persists between SNNs and Artificial Neural Networks (ANNs), +primarily due to current training methods and inherent model limitations. While +recent research has aimed to enhance SNN learning by employing knowledge +distillation (KD) from ANN teacher networks, traditional distillation +techniques often overlook the distinctive spatiotemporal properties of SNNs, +thus failing to fully leverage their advantages. To overcome these challenges, +we propose a novel logit distillation method characterized by temporal +separation and entropy regularization. This approach improves existing SNN +distillation techniques by performing distillation learning on logits across +different time steps, rather than merely on aggregated output features. +Furthermore, the integration of entropy regularization stabilizes model +optimization and further boosts the performance. Extensive experimental results +indicate that our method surpasses prior SNN distillation strategies, whether +based on logit distillation, feature distillation, or a combination of both. +The code will be available on GitHub. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ☆ Implicit U-KAN2.0: Dynamic, Efficient and Interpretable Medical Image + Segmentation + + +
+ Image segmentation is a fundamental task in both image analysis and medical +applications. State-of-the-art methods predominantly rely on encoder-decoder +architectures with a U-shaped design, commonly referred to as U-Net. Recent +advancements integrating transformers and MLPs improve performance but still +face key limitations, such as poor interpretability, difficulty handling +intrinsic noise, and constrained expressiveness due to discrete layer +structures, often lacking a solid theoretical foundation. In this work, we +introduce Implicit U-KAN 2.0, a novel U-Net variant that adopts a two-phase +encoder-decoder structure. In the SONO phase, we use a second-order neural +ordinary differential equation (NODEs), called the SONO block, for a more +efficient, expressive, and theoretically grounded modeling approach. In the +SONO-MultiKAN phase, we integrate the second-order NODEs and MultiKAN layer as +the core computational block to enhance interpretability and representation +power. Our contributions are threefold. First, U-KAN 2.0 is an implicit deep +neural network incorporating MultiKAN and second order NODEs, improving +interpretability and performance while reducing computational costs. Second, we +provide a theoretical analysis demonstrating that the approximation ability of +the MultiKAN block is independent of the input dimension. Third, we conduct +extensive experiments on a variety of 2D and a single 3D dataset, demonstrating +that our model consistently outperforms existing segmentation networks. + +
+
+
+
+
+ + ☆ Dynamic Neural Surfaces for Elastic 4D Shape Representation and Analysis + + +
+ We propose a novel framework for the statistical analysis of genus-zero 4D +surfaces, i.e., 3D surfaces that deform and evolve over time. This problem is +particularly challenging due to the arbitrary parameterizations of these +surfaces and their varying deformation speeds, necessitating effective +spatiotemporal registration. Traditionally, 4D surfaces are discretized, in +space and time, before computing their spatiotemporal registrations, geodesics, +and statistics. However, this approach may result in suboptimal solutions and, +as we demonstrate in this paper, is not necessary. In contrast, we treat 4D +surfaces as continuous functions in both space and time. We introduce Dynamic +Spherical Neural Surfaces (D-SNS), an efficient smooth and continuous +spatiotemporal representation for genus-0 4D surfaces. We then demonstrate how +to perform core 4D shape analysis tasks such as spatiotemporal registration, +geodesics computation, and mean 4D shape estimation, directly on these +continuous representations without upfront discretization and meshing. By +integrating neural representations with classical Riemannian geometry and +statistical shape analysis techniques, we provide the building blocks for +enabling full functional shape analysis. We demonstrate the efficiency of the +framework on 4D human and face datasets. The source code and additional results +are available at https://4d-dsns.github.io/DSNS/. + +
+
+ comment: 22 pages, 23 figures, conference paper +
+
+
+
+
+ + ☆ NTR-Gaussian: Nighttime Dynamic Thermal Reconstruction with 4D Gaussian + Splatting Based on Thermodynamics + + +
+ Thermal infrared imaging offers the advantage of all-weather capability, +enabling non-intrusive measurement of an object's surface temperature. +Consequently, thermal infrared images are employed to reconstruct 3D models +that accurately reflect the temperature distribution of a scene, aiding in +applications such as building monitoring and energy management. However, +existing approaches predominantly focus on static 3D reconstruction for a +single time period, overlooking the impact of environmental factors on thermal +radiation and failing to predict or analyze temperature variations over time. +To address these challenges, we propose the NTR-Gaussian method, which treats +temperature as a form of thermal radiation, incorporating elements like +convective heat transfer and radiative heat dissipation. Our approach utilizes +neural networks to predict thermodynamic parameters such as emissivity, +convective heat transfer coefficient, and heat capacity. By integrating these +predictions, we can accurately forecast thermal temperatures at various times +throughout a nighttime scene. Furthermore, we introduce a dynamic dataset +specifically for nighttime thermal imagery. Extensive experiments and +evaluations demonstrate that NTR-Gaussian significantly outperforms comparison +methods in thermal reconstruction, achieving a predicted temperature error +within 1 degree Celsius. + +
+
+ comment: IEEE Conference on Computer Vision and Pattern Recognition 2025 +
+
+
+
+
+ + ☆ An Improved Pure Fully Connected Neural Network for Rice Grain + Classification + + +
+ Rice is a staple food for a significant portion of the world's population, +providing essential nutrients and serving as a versatile ingredient in a wide +range of culinary traditions. Recently, the use of deep learning has enabled +automated classification of rice, improving accuracy and efficiency. However, +classical models based on first-stage training may face difficulties in +distinguishing between rice varieties with similar external characteristics, +thus leading to misclassifications. Considering the transparency and +feasibility of model, we selected and gradually improved pure fully connected +neural network to achieve classification of rice grain. The dataset we used +contains both global and domestic rice images obtained from websites and +laboratories respectively. First, the training mode was changed from one-stage +training to two-stage training, which significantly contributes to +distinguishing two similar types of rice. Secondly, the preprocessing method +was changed from random tilting to horizontal or vertical position correction. +After those two enhancements, the accuracy of our model increased notably from +97% to 99%. In summary, two subtle methods proposed in this study can +remarkably enhance the classification ability of deep learning models in terms +of the classification of rice grain. + +
+
+
+
+
+ + ☆ WarmFed: Federated Learning with Warm-Start for Globalization and + Personalization Via Personalized Diffusion Models + + +
+ Federated Learning (FL) stands as a prominent distributed learning paradigm +among multiple clients to achieve a unified global model without privacy +leakage. In contrast to FL, Personalized federated learning aims at serving for +each client in achieving personalized model. However, previous FL frameworks +have grappled with a dilemma: the choice between developing a singular global +model at the server to bolster globalization or nurturing personalized model at +the client to accommodate personalization. Instead of making trade-offs, this +paper commences its discourse from the pre-trained initialization, obtaining +resilient global information and facilitating the development of both global +and personalized models. Specifically, we propose a novel method called WarmFed +to achieve this. WarmFed customizes Warm-start through personalized diffusion +models, which are generated by local efficient fine-tuning (LoRA). Building +upon the Warm-Start, we advance a server-side fine-tuning strategy to derive +the global model, and propose a dynamic self-distillation (DSD) to procure more +resilient personalized models simultaneously. Comprehensive experiments +underscore the substantial gains of our approach across both global and +personalized models, achieved within just one-shot and five communication(s). + +
+
+
+
+
+ + ☆ RVAFM: Re-parameterizing Vertical Attention Fusion Module for + Handwritten Paragraph Text Recognition + + +
+ Handwritten Paragraph Text Recognition (HPTR) is a challenging task in +Computer Vision, requiring the transformation of a paragraph text image, rich +in handwritten text, into text encoding sequences. One of the most advanced +models for this task is Vertical Attention Network (VAN), which utilizes a +Vertical Attention Module (VAM) to implicitly segment paragraph text images +into text lines, thereby reducing the difficulty of the recognition task. +However, from a network structure perspective, VAM is a single-branch module, +which is less effective in learning compared to multi-branch modules. In this +paper, we propose a new module, named Re-parameterizing Vertical Attention +Fusion Module (RVAFM), which incorporates structural re-parameterization +techniques. RVAFM decouples the structure of the module during training and +inference stages. During training, it uses a multi-branch structure for more +effective learning, and during inference, it uses a single-branch structure for +faster processing. The features learned by the multi-branch structure are fused +into the single-branch structure through a special fusion method named +Re-parameterization Fusion (RF) without any loss of information. As a result, +we achieve a Character Error Rate (CER) of 4.44% and a Word Error Rate (WER) of +14.37% on the IAM paragraph-level test set. Additionally, the inference speed +is slightly faster than VAN. + +
+
+
+
+
+ + ☆ AHCPTQ: Accurate and Hardware-Compatible Post-Training Quantization for + Segment Anything Model + + +
+ The Segment Anything Model (SAM) has demonstrated strong versatility across +various visual tasks. However, its large storage requirements and high +computational cost pose challenges for practical deployment. Post-training +quantization (PTQ) has emerged as an effective strategy for efficient +deployment, but we identify two key challenges in SAM that hinder the +effectiveness of existing PTQ methods: the heavy-tailed and skewed distribution +of post-GELU activations, and significant inter-channel variation in linear +projection activations. To address these challenges, we propose AHCPTQ, an +accurate and hardware-efficient PTQ method for SAM. AHCPTQ introduces +hardware-compatible Hybrid Log-Uniform Quantization (HLUQ) to manage post-GELU +activations, employing log2 quantization for dense small values and uniform +quantization for sparse large values to enhance quantization resolution. +Additionally, AHCPTQ incorporates Channel-Aware Grouping (CAG) to mitigate +inter-channel variation by progressively clustering activation channels with +similar distributions, enabling them to share quantization parameters and +improving hardware efficiency. The combination of HLUQ and CAG not only +enhances quantization effectiveness but also ensures compatibility with +efficient hardware execution. For instance, under the W4A4 configuration on the +SAM-L model, AHCPTQ achieves 36.6% mAP on instance segmentation with the DINO +detector, while achieving a 7.89x speedup and 8.64x energy efficiency over its +floating-point counterpart in FPGA implementation. + +
+
+
+
+
+ + ☆ BEVDriver: Leveraging BEV Maps in LLMs for Robust Closed-Loop Driving + + +
+ Autonomous driving has the potential to set the stage for more efficient +future mobility, requiring the research domain to establish trust through safe, +reliable and transparent driving. Large Language Models (LLMs) possess +reasoning capabilities and natural language understanding, presenting the +potential to serve as generalized decision-makers for ego-motion planning that +can interact with humans and navigate environments designed for human drivers. +While this research avenue is promising, current autonomous driving approaches +are challenged by combining 3D spatial grounding and the reasoning and language +capabilities of LLMs. We introduce BEVDriver, an LLM-based model for end-to-end +closed-loop driving in CARLA that utilizes latent BEV features as perception +input. BEVDriver includes a BEV encoder to efficiently process multi-view +images and 3D LiDAR point clouds. Within a common latent space, the BEV +features are propagated through a Q-Former to align with natural language +instructions and passed to the LLM that predicts and plans precise future +trajectories while considering navigation instructions and critical scenarios. +On the LangAuto benchmark, our model reaches up to 18.9% higher performance on +the Driving Score compared to SoTA methods. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Multi-View Depth Consistent Image Generation Using Generative AI Models: + Application on Architectural Design of University Buildings + + +
+ In the early stages of architectural design, shoebox models are typically +used as a simplified representation of building structures but require +extensive operations to transform them into detailed designs. Generative +artificial intelligence (AI) provides a promising solution to automate this +transformation, but ensuring multi-view consistency remains a significant +challenge. To solve this issue, we propose a novel three-stage consistent image +generation framework using generative AI models to generate architectural +designs from shoebox model representations. The proposed method enhances +state-of-the-art image generation diffusion models to generate multi-view +consistent architectural images. We employ ControlNet as the backbone and +optimize it to accommodate multi-view inputs of architectural shoebox models +captured from predefined perspectives. To ensure stylistic and structural +consistency across multi-view images, we propose an image space loss module +that incorporates style loss, structural loss and angle alignment loss. We then +use depth estimation method to extract depth maps from the generated multi-view +images. Finally, we use the paired data of the architectural images and depth +maps as inputs to improve the multi-view consistency via the depth-aware 3D +attention module. Experimental results demonstrate that the proposed framework +can generate multi-view architectural images with consistent style and +structural coherence from shoebox model inputs. + +
+
+ comment: 10 pages, 7 figures, in Proceedings of CAADRIA2025 +
+
+
+
+
+ + ♻ ☆ NVILA: Efficient Frontier Visual Language Models + + +
+ Visual language models (VLMs) have made significant advances in accuracy in +recent years. However, their efficiency has received much less attention. This +paper introduces NVILA, a family of open VLMs designed to optimize both +efficiency and accuracy. Building on top of VILA, we improve its model +architecture by first scaling up the spatial and temporal resolutions, and then +compressing visual tokens. This "scale-then-compress" approach enables NVILA to +efficiently process high-resolution images and long videos. We also conduct a +systematic investigation to enhance the efficiency of NVILA throughout its +entire lifecycle, from training and fine-tuning to deployment. NVILA matches or +surpasses the accuracy of many leading open and proprietary VLMs across a wide +range of image and video benchmarks. At the same time, it reduces training +costs by 4.5X, fine-tuning memory usage by 3.4X, pre-filling latency by +1.6-2.2X, and decoding latency by 1.2-2.8X. We will soon make our code and +models available to facilitate reproducibility. + +
+
+
+
+
+ + ♻ ☆ Fractal Calibration for long-tailed object detection CVPR2025 + + +
+ Real-world datasets follow an imbalanced distribution, which poses +significant challenges in rare-category object detection. Recent studies tackle +this problem by developing re-weighting and re-sampling methods, that utilise +the class frequencies of the dataset. However, these techniques focus solely on +the frequency statistics and ignore the distribution of the classes in image +space, missing important information. In contrast to them, we propose FRActal +CALibration (FRACAL): a novel post-calibration method for long-tailed object +detection. FRACAL devises a logit adjustment method that utilises the fractal +dimension to estimate how uniformly classes are distributed in image space. +During inference, it uses the fractal dimension to inversely downweight the +probabilities of uniformly spaced class predictions achieving balance in two +axes: between frequent and rare categories, and between uniformly spaced and +sparsely spaced classes. FRACAL is a post-processing method and it does not +require any training, also it can be combined with many off-the-shelf models +such as one-stage sigmoid detectors and two-stage instance segmentation models. +FRACAL boosts the rare class performance by up to 8.6% and surpasses all +previous methods on LVIS dataset, while showing good generalisation to other +datasets such as COCO, V3Det and OpenImages. We provide the code at +https://github.com/kostas1515/FRACAL. + +
+
+ comment: CVPR2025 +
+
+
+
+
+ + ♻ ☆ Beyond Matryoshka: Revisiting Sparse Coding for Adaptive Representation + + +
+ Many large-scale systems rely on high-quality deep representations +(embeddings) to facilitate tasks like retrieval, search, and generative +modeling. Matryoshka Representation Learning (MRL) recently emerged as a +solution for adaptive embedding lengths, but it requires full model retraining +and suffers from noticeable performance degradations at short lengths. In this +paper, we show that sparse coding offers a compelling alternative for achieving +adaptive representation with minimal overhead and higher fidelity. We propose +Contrastive Sparse Representation (CSR), a method that sparsifies pre-trained +embeddings into a high-dimensional but selectively activated feature space. By +leveraging lightweight autoencoding and task-aware contrastive objectives, CSR +preserves semantic quality while allowing flexible, cost-effective inference at +different sparsity levels. Extensive experiments on image, text, and multimodal +benchmarks demonstrate that CSR consistently outperforms MRL in terms of both +accuracy and retrieval speed-often by large margins-while also cutting training +time to a fraction of that required by MRL. Our results establish sparse coding +as a powerful paradigm for adaptive representation learning in real-world +applications where efficiency and fidelity are both paramount. Code is +available at https://github.com/neilwen987/CSR_Adaptive_Rep + +
+
+ comment: A novel sparse coding framework designed for learning adaptive + representation +
+
+
+
+
+ + ♻ ☆ What to align in multimodal contrastive learning? ICLR 2025 + + +
+ Humans perceive the world through multisensory integration, blending the +information of different modalities to adapt their behavior. Contrastive +learning offers an appealing solution for multimodal self-supervised learning. +Indeed, by considering each modality as a different view of the same entity, it +learns to align features of different modalities in a shared representation +space. However, this approach is intrinsically limited as it only learns shared +or redundant information between modalities, while multimodal interactions can +arise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal +learning strategy that enables the communication between modalities in a single +multimodal space. Instead of imposing cross- or intra- modality constraints, we +propose to align multimodal representations by maximizing the mutual +information between augmented versions of these multimodal features. Our +theoretical analysis shows that shared, synergistic and unique terms of +information naturally emerge from this formulation, allowing us to estimate +multimodal interactions beyond redundancy. We test CoMM both in a controlled +and in a series of real-world settings: in the former, we demonstrate that CoMM +effectively captures redundant, unique and synergistic information between +modalities. In the latter, CoMM learns complex multimodal interactions and +achieves state-of-the-art results on the seven multimodal benchmarks. Code is +available at https://github.com/Duplums/CoMM + +
+
+ comment: ICLR 2025, 25 pages +
+
+
+
+
+ + ♻ ☆ More than Memes: A Multimodal Topic Modeling Approach to Conspiracy + Theories on Telegram + + +
+ To address the increasing prevalence of (audio-)visual data on social media, +and to capture the evolving and dynamic nature of this communication, +researchers have begun to explore the potential of unsupervised approaches for +analyzing multimodal online content. However, existing research often neglects +visual content beyond memes, and in addition lacks methods to compare topic +models across modalities. Our study addresses these gaps by applying multimodal +topic modeling for analyzing conspiracy theories in German-language Telegram +channels. We use BERTopic with CLIP for the analysis of textual and visual data +in a corpus of ~40,000 Telegram messages posted in October 2023 in 571 +German-language Telegram channels known for disseminating conspiracy theories. +Through this dataset, we provide insights into unimodal and multimodal topic +models by analyzing symmetry and intersections of topics across modalities. We +demonstrate the variety of textual and visual content shared in the channels +discovered through the topic modeling, and propose a conceptual framework for +the analysis of textual and visual discursive strategies in the communication +of conspiracy theories. We apply the framework in a case study of the topic +group Israel Gaza. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Reasoning to Attend: Try to Understand How Token Works CVPR 2025 + + +
+ Current Large Multimodal Models (LMMs) empowered visual grounding typically +rely on $\texttt{}$ token as a text prompt to jointly optimize the +vision-language model (e.g., LLaVA) and the downstream task-specified model +(\eg, SAM). However, we observe that little research has looked into how it +works. In this work, we first visualize the similarity maps, which are obtained +by computing the semantic similarity between the $\texttt{}$ token and the +image token embeddings derived from the last hidden layer in both the LLaVA +encoder and SAM decoder. Intriguingly, we have found that a striking +consistency holds in terms of activation responses in the similarity map,which +reveals that what $\texttt{}$ token contributes to is the semantic +similarity within image-text pairs. Specifically, $\texttt{}$ token, a +placeholder expanded in text vocabulary, extensively queries among individual +tokenized image patches to match the semantics of an object from text to the +paired image while the Large Language Models (LLMs) are being fine-tuned. Upon +the above findings, we present READ, which facilitates LMMs' resilient +$\textbf{REA}$soning capability of where to atten$\textbf{D}$ under the +guidance of highly activated points borrowed from similarity maps. Remarkably, +READ features an intuitive design, Similarity as Points module (SasP), which +can be seamlessly applied to $\texttt{}$-like paradigms in a plug-and-play +fashion. Also, extensive experiments have been conducted on the ReasonSeg and +RefCOCO(+/g) datasets. To validate whether READ suffers from catastrophic +forgetting of previous skills after fine-tuning, we further assess its +generation ability on an augmented FP-RefCOCO(+/g) dataset. All codes and +models are publicly available at https://github.com/rui-qian/READ. + +
+
+ comment: This work has been accepted to CVPR 2025, please refer to + https://github.com/rui-qian/READ +
+
+
+
+
+ + ♻ ☆ StdGEN: Semantic-Decomposed 3D Character Generation from Single Images CVPR 2025 + + +
+ We present StdGEN, an innovative pipeline for generating semantically +decomposed high-quality 3D characters from single images, enabling broad +applications in virtual reality, gaming, and filmmaking, etc. Unlike previous +methods which struggle with limited decomposability, unsatisfactory quality, +and long optimization times, StdGEN features decomposability, effectiveness and +efficiency; i.e., it generates intricately detailed 3D characters with +separated semantic components such as the body, clothes, and hair, in three +minutes. At the core of StdGEN is our proposed Semantic-aware Large +Reconstruction Model (S-LRM), a transformer-based generalizable model that +jointly reconstructs geometry, color and semantics from multi-view images in a +feed-forward manner. A differentiable multi-layer semantic surface extraction +scheme is introduced to acquire meshes from hybrid implicit fields +reconstructed by our S-LRM. Additionally, a specialized efficient multi-view +diffusion model and an iterative multi-layer surface refinement module are +integrated into the pipeline to facilitate high-quality, decomposable 3D +character generation. Extensive experiments demonstrate our state-of-the-art +performance in 3D anime character generation, surpassing existing baselines by +a significant margin in geometry, texture and decomposability. StdGEN offers +ready-to-use semantic-decomposed 3D characters and enables flexible +customization for a wide range of applications. Project page: +https://stdgen.github.io + +
+
+ comment: CVPR 2025. 13 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Unleashing HyDRa: Hybrid Fusion, Depth Consistency and Radar for Unified + 3D Perception + + +
+ Low-cost, vision-centric 3D perception systems for autonomous driving have +made significant progress in recent years, narrowing the gap to expensive +LiDAR-based methods. The primary challenge in becoming a fully reliable +alternative lies in robust depth prediction capabilities, as camera-based +systems struggle with long detection ranges and adverse lighting and weather +conditions. In this work, we introduce HyDRa, a novel camera-radar fusion +architecture for diverse 3D perception tasks. Building upon the principles of +dense BEV (Bird's Eye View)-based architectures, HyDRa introduces a hybrid +fusion approach to combine the strengths of complementary camera and radar +features in two distinct representation spaces. Our Height Association +Transformer module leverages radar features already in the perspective view to +produce more robust and accurate depth predictions. In the BEV, we refine the +initial sparse representation by a Radar-weighted Depth Consistency. HyDRa +achieves a new state-of-the-art for camera-radar fusion of 64.2 NDS (+1.8) and +58.4 AMOTA (+1.5) on the public nuScenes dataset. Moreover, our new +semantically rich and spatially accurate BEV features can be directly converted +into a powerful occupancy representation, beating all previous camera-based +methods on the Occ3D benchmark by an impressive 3.7 mIoU. Code and models are +available at https://github.com/phi-wol/hydra. + +
+
+ comment: 10 pages, 7 figures, added eval on VoD, added appendix +
+
+
+
+
+ + ♻ ☆ On the Utility of Equivariance and Symmetry Breaking in Deep Learning + Architectures on Point Clouds + + +
+ This paper explores the key factors that influence the performance of models +working with point clouds, across different tasks of varying geometric +complexity. In this work, we explore the trade-offs between flexibility and +weight-sharing introduced by equivariant layers, assessing when equivariance +boosts or detracts from performance. It is often argued that providing more +information as input improves a model's performance. However, if this +additional information breaks certain properties, such as $\mathrm{SE}(3)$ +equivariance, does it remain beneficial? We identify the key aspects of +equivariant and non-equivariant architectures that drive success in different +tasks by benchmarking them on segmentation, regression, and generation tasks +across multiple datasets with increasing complexity. We observe a positive +impact of equivariance, which becomes more pronounced with increasing task +complexity, even when strict equivariance is not required. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Human-in-the-loop Reasoning For Traffic Sign Detection: Collaborative + Approach Yolo With Video-llava + + +
+ Traffic Sign Recognition (TSR) detection is a crucial component of autonomous +vehicles. While You Only Look Once (YOLO) is a popular real-time object +detection algorithm, factors like training data quality and adverse weather +conditions (e.g., heavy rain) can lead to detection failures. These failures +can be particularly dangerous when visual similarities between objects exist, +such as mistaking a 30 km/h sign for a higher speed limit sign. This paper +proposes a method that combines video analysis and reasoning, prompting with a +human-in-the-loop guide large vision model to improve YOLO's accuracy in +detecting road speed limit signs, especially in semi-real-world conditions. It +is hypothesized that the guided prompting and reasoning abilities of +Video-LLava can enhance YOLO's traffic sign detection capabilities. This +hypothesis is supported by an evaluation based on human-annotated accuracy +metrics within a dataset of recorded videos from the CARLA car simulator. The +results demonstrate that a collaborative approach combining YOLO with +Video-LLava and reasoning can effectively address challenging situations such +as heavy rain and overcast conditions that hinder YOLO's detection capabilities. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Tiny Robotics Dataset and Benchmark for Continual Object Detection + + +
+ Detecting objects in mobile robotics is crucial for numerous applications, +from autonomous navigation to inspection. However, robots often need to operate +in different domains from those they were trained in, requiring them to adjust +to these changes. Tiny mobile robots, subject to size, power, and computational +constraints, encounter even more difficulties in running and adapting these +algorithms. Such adaptability, though, is crucial for real-world deployment, +where robots must operate effectively in dynamic and unpredictable settings. In +this work, we introduce a novel benchmark to evaluate the continual learning +capabilities of object detection systems in tiny robotic platforms. Our +contributions include: (i) Tiny Robotics Object Detection (TiROD), a +comprehensive dataset collected using the onboard camera of a small mobile +robot, designed to test object detectors across various domains and classes; +(ii) a benchmark of different continual learning strategies on this dataset +using NanoDet, a lightweight object detector. Our results highlight key +challenges in developing robust and efficient continual learning strategies for +object detectors in tiny robotics. + +
+
+
+
+
+ + ♻ ☆ Safety Without Semantic Disruptions: Editing-free Safe Image Generation + via Context-preserving Dual Latent Reconstruction + + +
+ Training multimodal generative models on large, uncurated datasets can result +in users being exposed to harmful, unsafe and controversial or +culturally-inappropriate outputs. While model editing has been proposed to +remove or filter undesirable concepts in embedding and latent spaces, it can +inadvertently damage learned manifolds, distorting concepts in close semantic +proximity. We identify limitations in current model editing techniques, showing +that even benign, proximal concepts may become misaligned. To address the need +for safe content generation, we leverage safe embeddings and a modified +diffusion process with tunable weighted summation in the latent space to +generate safer images. Our method preserves global context without compromising +the structural integrity of the learned manifolds. We achieve state-of-the-art +results on safe image generation benchmarks and offer intuitive control over +the level of model safety. We identify trade-offs between safety and +censorship, which presents a necessary perspective in the development of +ethical AI models. We will release our code. + Keywords: Text-to-Image Models, Generative AI, Safety, Reliability, Model +Editing + +
+
+ comment: This research is supported by the NISDRG project #20100007, funded by + the Australian Government +
+
+
+
+
+ + ♻ ☆ VideoWorld: Exploring Knowledge Learning from Unlabeled Videos + + +
+ This work explores whether a deep generative model can learn complex +knowledge solely from visual input, in contrast to the prevalent focus on +text-based models like large language models (LLMs). We develop VideoWorld, an +auto-regressive video generation model trained on unlabeled video data, and +test its knowledge acquisition abilities in video-based Go and robotic control +tasks. Our experiments reveal two key findings: (1) video-only training +provides sufficient information for learning knowledge, including rules, +reasoning and planning capabilities, and (2) the representation of visual +change is crucial for knowledge acquisition. To improve both the efficiency and +efficacy of this process, we introduce the Latent Dynamics Model (LDM) as a key +component of VideoWorld. Remarkably, VideoWorld reaches a 5-dan professional +level in the Video-GoBench with just a 300-million-parameter model, without +relying on search algorithms or reward mechanisms typical in reinforcement +learning. In robotic tasks, VideoWorld effectively learns diverse control +operations and generalizes across environments, approaching the performance of +oracle models in CALVIN and RLBench. This study opens new avenues for knowledge +acquisition from visual data, with all code, data, and models open-sourced for +further research. + +
+
+ comment: Code and models are released at: + https://maverickren.github.io/VideoWorld.github.io/ +
+
+
+
+
+ + ♻ ☆ Perceptual Multi-Exposure Fusion + + +
+ With an ever-increasing demand for high dynamic range (HDR) scene shooting, +multi-exposure image fusion (MEF) technology has abounded. In recent years, +multi-scale exposure fusion approaches based on detail-enhancement have led the +way for improvement in highlight and shadow details. Most of such methods, +however, are too computationally expensive to be deployed on mobile devices. +This paper presents a perceptual multi-exposure fusion method that not only +ensures fine shadow/highlight details but also has lower complexity than +detail-enhanced methods. We analyze the potential defects of three classical +exposure measures in lieu of using detail-enhancement component and improve two +of them, namely adaptive Well-exposedness (AWE) and the gradient of color images +(3-D gradient). AWE designed in YCbCr color space considers the difference +between varying exposure images. 3-D gradient is employed to extract fine +details. We build a large-scale multi-exposure benchmark dataset suitable for +static scenes, which contains 167 image sequences all told. Experiments on the +constructed dataset demonstrate that the proposed method exceeds eight existing +state-of-the-art approaches in terms of visual quality and MEF-SSIM value. Moreover, +our approach can achieve a better improvement for current image enhancement +techniques, ensuring fine detail in bright light. + +
+
+
+
+
+ + ♻ ☆ DFREC: DeepFake Identity Recovery Based on Identity-aware Masked + Autoencoder + + +
+ Recent advances in deepfake forensics have primarily focused on improving the +classification accuracy and generalization performance. Despite enormous +progress in detection accuracy across a wide variety of forgery algorithms, +existing algorithms lack intuitive interpretability and identity traceability +to help with forensic investigation. In this paper, we introduce a novel +DeepFake Identity Recovery scheme (DFREC) to fill this gap. DFREC aims to +recover the pair of source and target faces from a deepfake image to facilitate +deepfake identity tracing and reduce the risk of deepfake attack. It comprises +three key components: an Identity Segmentation Module (ISM), a Source Identity +Reconstruction Module (SIRM), and a Target Identity Reconstruction Module +(TIRM). The ISM segments the input face into distinct source and target face +information, and the SIRM reconstructs the source face and extracts latent +target identity features with the segmented source information. The background +context and latent target identity features are synergetically fused by a +Masked Autoencoder in the TIRM to reconstruct the target face. We evaluate +DFREC on six different high-fidelity face-swapping attacks on FaceForensics++, +CelebaMegaFS and FFHQ-E4S datasets, which demonstrate its superior recovery +performance over state-of-the-art deepfake recovery algorithms. In addition, +DFREC is the only scheme that can recover both pristine source and target faces +directly from the forgery image with high fidelity. + +
+
+
+
+
+ + ♻ ☆ Deblur-Avatar: Animatable Avatars from Motion-Blurred Monocular Videos + + +
+ We introduce a novel framework for modeling high-fidelity, animatable 3D +human avatars from motion-blurred monocular video inputs. Motion blur is +prevalent in real-world dynamic video capture, especially due to human +movements in 3D human avatar modeling. Existing methods either (1) assume sharp +image inputs, failing to address the detail loss introduced by motion blur, or +(2) mainly consider blur by camera movements, neglecting the human motion blur +which is more common in animatable avatars. Our proposed approach integrates a +human movement-based motion blur model into 3D Gaussian Splatting (3DGS). By +explicitly modeling human motion trajectories during exposure time, we jointly +optimize the trajectories and 3D Gaussians to reconstruct sharp, high-quality +human avatars. We employ a pose-dependent fusion mechanism to distinguish +moving body regions, optimizing both blurred and sharp areas effectively. +Extensive experiments on synthetic and real-world datasets demonstrate that our +method significantly outperforms existing methods in rendering quality and +quantitative metrics, producing sharp avatar reconstructions and enabling +real-time rendering under challenging motion blur conditions. + +
+
+
+
+
+ + ♻ ☆ BHViT: Binarized Hybrid Vision Transformer CVPR2025 + + +
+ Model binarization has made significant progress in enabling real-time and +energy-efficient computation for convolutional neural networks (CNN), offering +a potential solution to the deployment challenges faced by Vision Transformers +(ViTs) on edge devices. However, due to the structural differences between CNN +and Transformer architectures, simply applying binary CNN strategies to the ViT +models will lead to a significant performance drop. To tackle this challenge, +we propose BHViT, a binarization-friendly hybrid ViT architecture and its full +binarization model with the guidance of three important observations. +Initially, BHViT utilizes the local information interaction and hierarchical +feature aggregation technique from coarse to fine levels to address redundant +computations stemming from excessive tokens. Then, a novel module based on +shift operations is proposed to enhance the performance of the binary +Multilayer Perceptron (MLP) module without significantly increasing +computational overhead. In addition, an innovative attention matrix +binarization method based on quantization decomposition is proposed to evaluate +the token's importance in the binarized attention matrix. Finally, we propose a +regularization loss to address the inadequate optimization caused by the +incompatibility between the weight oscillation in the binary layers and the +Adam Optimizer. Extensive experimental results demonstrate that our proposed +algorithm achieves SOTA performance among binary ViT methods. + +
+
+ comment: Accepted by CVPR2025 +
+
+
+
+
+ + ♻ ☆ LDPM: Towards undersampled MRI reconstruction with MR-VAE and Latent + Diffusion Prior + + +
+ Diffusion models, as powerful generative models, have found a wide range of +applications and shown great potential in solving image reconstruction +problems. Some works attempted to solve MRI reconstruction with diffusion +models, but these methods operate directly in pixel space, leading to higher +computational costs for optimization and inference. Latent diffusion models, +pre-trained on natural images with rich visual priors, are expected to solve +the high computational cost problem in MRI reconstruction by operating in a +lower-dimensional latent space. However, direct application to MRI +reconstruction faces three key challenges: (1) absence of explicit control +mechanisms for medical fidelity, (2) domain gap between natural images and MR +physics, and (3) undefined data consistency in latent space. To address these +challenges, a novel Latent Diffusion Prior-based undersampled MRI +reconstruction (LDPM) method is proposed. Our LDPM framework addresses these +challenges by: (1) a sketch-guided pipeline with a two-step reconstruction +strategy, which balances perceptual quality and anatomical fidelity, (2) an +MRI-optimized VAE (MR-VAE), which achieves an improvement of approximately 3.92 +dB in PSNR for undersampled MRI reconstruction compared to that with SD-VAE, +and (3) Dual-Stage Sampler, a modified version of spaced DDPM +sampler, which enforces high-fidelity reconstruction in the latent space. +Experiments on the fastMRI dataset demonstrate the +state-of-the-art performance of the proposed method and its robustness across +various scenarios. The effectiveness of each module is also verified through +ablation experiments. + +
+
+
+
+
+ + ♻ ☆ GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for + Improved Visual Localization + + +
+ Although various visual localization approaches exist, such as scene +coordinate regression and camera pose regression, these methods often struggle +with optimization complexity or limited accuracy. To address these challenges, +we explore the use of novel view synthesis techniques, particularly 3D Gaussian +Splatting (3DGS), which enables the compact encoding of both 3D geometry and +scene appearance. We propose a two-stage procedure that integrates dense and +robust keypoint descriptors from the lightweight XFeat feature extractor into +3DGS, enhancing performance in both indoor and outdoor environments. The coarse +pose estimates are directly obtained via 2D-3D correspondences between the 3DGS +representation and query image descriptors. In the second stage, the initial +pose estimate is refined by minimizing the rendering-based photometric warp +loss. Benchmarking on widely used indoor and outdoor datasets demonstrates +improvements over recent neural rendering-based localization methods, such as +NeRFMatch and PNeRFLoc. + +
+
+ comment: Project website at https://gsplatloc.github.io/ +
+
+
+
+
+ + ♻ ☆ ArtNVG: Content-Style Separated Artistic Neighboring-View Gaussian + Stylization + + +
+ As demand from the film and gaming industries for 3D scenes with target +styles grows, the importance of advanced 3D stylization techniques increases. +However, recent methods often struggle to maintain local consistency in color +and texture throughout stylized scenes, which is essential for maintaining +aesthetic coherence. To solve this problem, this paper introduces ArtNVG, an +innovative 3D stylization framework that efficiently generates stylized 3D +scenes by leveraging reference style images. Built on 3D Gaussian Splatting +(3DGS), ArtNVG achieves rapid optimization and rendering while upholding high +reconstruction quality. Our framework realizes high-quality 3D stylization by +incorporating two pivotal techniques: Content-Style Separated Control and +Attention-based Neighboring-View Alignment. Content-Style Separated Control +uses the CSGO model and the Tile ControlNet to decouple the content and style +control, reducing risks of information leakage. Concurrently, Attention-based +Neighboring-View Alignment ensures consistency of local colors and textures +across neighboring views, significantly improving visual quality. Extensive +experiments validate that ArtNVG surpasses existing methods, delivering +superior results in content preservation, style alignment, and local +consistency. + +
+
+
+
+
+ + ♻ ☆ Multimodal Action Quality Assessment + + +
+ Action quality assessment (AQA) is to assess how well an action is performed. +Previous works perform modelling by only the use of visual information, +ignoring audio information. We argue that although AQA is highly dependent on +visual information, the audio is useful complementary information for improving +the score regression accuracy, especially for sports with background music, +such as figure skating and rhythmic gymnastics. To leverage multimodal +information for AQA, i.e., RGB, optical flow and audio information, we propose +a Progressive Adaptive Multimodal Fusion Network (PAMFN) that separately models +modality-specific information and mixed-modality information. Our model +consists of three modality-specific branches that independently explore +modality-specific information and a mixed-modality branch that progressively +aggregates the modality-specific information from the modality-specific +branches. To build the bridge between modality-specific branches and the +mixed-modality branch, three novel modules are proposed. First, a +Modality-specific Feature Decoder module is designed to selectively transfer +modality-specific information to the mixed-modality branch. Second, when +exploring the interaction between modality-specific information, we argue that +using an invariant multimodal fusion policy may lead to suboptimal results, so +as to take the potential diversity in different parts of an action into +consideration. Therefore, an Adaptive Fusion Module is proposed to learn +adaptive multimodal fusion policies in different parts of an action. This +module consists of several FusionNets for exploring different multimodal fusion +strategies and a PolicyNet for deciding which FusionNets are enabled. Third, a +module called Cross-modal Feature Decoder is designed to transfer cross-modal +features generated by Adaptive Fusion Module to the mixed-modality branch. + +
+
+ comment: IEEE Transactions on Image Processing 2024 +
+
+
+
+
+ + ♻ ☆ MVP-Shot: Multi-Velocity Progressive-Alignment Framework for Few-Shot + Action Recognition + + +
+ Recent few-shot action recognition (FSAR) methods typically perform semantic +matching on learned discriminative features to achieve promising performance. +However, most FSAR methods focus on single-scale (e.g., frame-level, +segment-level, etc) feature alignment, which ignores that human actions with +the same semantic may appear at different velocities. To this end, we develop a +novel Multi-Velocity Progressive-alignment (MVP-Shot) framework to +progressively learn and align semantic-related action features at +multi-velocity levels. Concretely, a Multi-Velocity Feature Alignment (MVFA) +module is designed to measure the similarity between features from support and +query videos with different velocity scales and then merge all similarity +scores in a residual fashion. To avoid the multiple velocity features deviating +from the underlying motion semantic, our proposed Progressive Semantic-Tailored +Interaction (PSTI) module injects velocity-tailored text information into the +video feature via feature interaction on channel and temporal domains at +different velocities. The above two modules compensate for each other to make +more accurate query sample predictions under the few-shot settings. +Experimental results show our method outperforms current state-of-the-art +methods on multiple standard few-shot benchmarks (i.e., HMDB51, UCF101, +Kinetics, and SSv2-small). + +
+
+ comment: Accepted to TMM 2025 +
+
+
+
+
+ + ♻ ☆ ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for + Complicated Chart Reasoning + + +
+ Recently, many versatile Multi-modal Large Language Models (MLLMs) have +emerged continuously. However, their capacity to query information depicted in +visual charts and engage in reasoning based on the queried contents remains +under-explored. In this paper, to comprehensively and rigorously benchmark the +ability of the off-the-shelf MLLMs in the chart domain, we construct ChartX, a +multi-modal evaluation set covering 18 chart types, 7 chart tasks, 22 +disciplinary topics, and high-quality chart data. Besides, we develop ChartVLM +to offer a new perspective on handling multi-modal tasks that strongly depend +on interpretable patterns, such as reasoning tasks in the field of charts or +geometric images. We evaluate the chart-related ability of mainstream MLLMs and +our ChartVLM on the proposed ChartX evaluation set. Extensive experiments +demonstrate that ChartVLM surpasses both versatile and chart-related large +models, achieving results comparable to GPT-4V. We believe that our study can +pave the way for further exploration in creating a more comprehensive chart +evaluation set and developing more interpretable multi-modal models. Both +ChartX and ChartVLM are available at: +https://github.com/Alpha-Innovator/ChartVLM + +
+
+ comment: Code and dataset are available for downloading at: + https://github.com/Alpha-Innovator/ChartVLM 26 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing + Diffusion Models + + +
+ In the era of AIGC, the demand for low-budget or even on-device applications +of diffusion models emerged. In terms of compressing the Stable Diffusion +models (SDMs), several approaches have been proposed, and most of them +leveraged the handcrafted layer removal methods to obtain smaller U-Nets, along +with knowledge distillation to recover the network performance. However, such a +handcrafting manner of layer removal is inefficient and lacks scalability and +generalization, and the feature distillation employed in the retraining phase +faces an imbalance issue that a few numerically significant feature loss terms +dominate over others throughout the retraining process. To this end, we +proposed the layer pruning and normalized distillation for compressing +diffusion models (LAPTOP-Diff). We 1) introduced the layer pruning method to +compress SDM's U-Net automatically and proposed an effective one-shot pruning +criterion whose one-shot performance is guaranteed by its good additivity +property, surpassing other layer pruning and handcrafted layer removal methods, +and 2) proposed the normalized feature distillation for retraining, alleviating the +imbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of +SDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0% +decline in PickScore at a pruning ratio of 50% while the comparative methods' +minimal PickScore decline is 8.2%. + +
+
+
+
+
+ + ♻ ☆ Handling Spatial-Temporal Data Heterogeneity for Federated Continual + Learning via Tail Anchor CVPR 2025 + + +
+ Federated continual learning (FCL) allows each client to continually update +its knowledge from task streams, enhancing the applicability of federated +learning in real-world scenarios. However, FCL needs to address not only +spatial data heterogeneity between clients but also temporal data heterogeneity +between tasks. In this paper, empirical experiments demonstrate that such +input-level heterogeneity significantly affects the model's internal parameters +and outputs, leading to severe spatial-temporal catastrophic forgetting of +local and previous knowledge. To this end, we propose Federated Tail Anchor +(FedTA) to mix trainable Tail Anchor with the frozen output features to adjust +their position in the feature space, thereby overcoming parameter-forgetting +and output-forgetting. Three novel components are also included: Input +Enhancement for improving the performance of pre-trained models on downstream +tasks; Selective Input Knowledge Fusion for fusion of heterogeneous local +knowledge on the server; and Best Global Prototype Selection for finding the +best anchor point for each class in the feature space. Extensive experiments +demonstrate that FedTA not only outperforms existing FCL methods but also +effectively preserves the relative positions of features. + +
+
+ comment: This paper is accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Sim2Real within 5 Minutes: Efficient Domain Transfer with Stylized + Gaussian Splatting for Endoscopic Images ICRA 2025 + + +
+ Robot assisted endoluminal intervention is an emerging technique for both +benign and malignant luminal lesions. With vision-based navigation, when +combined with pre-operative imaging data as priors, it is possible to recover +position and pose of the endoscope without the need of additional sensors. In +practice, however, aligning pre-operative and intra-operative domains is +complicated by significant texture differences. Although methods such as style +transfer can be used to address this issue, they require large datasets from +both source and target domains with prolonged training times. This paper +proposes an efficient domain transfer method based on stylized Gaussian +splatting, only requiring a few real images (10 images) with very fast +training time. Specifically, the transfer process includes two phases. In the +first phase, the 3D models reconstructed from CT scans are represented as +differential Gaussian point clouds. In the second phase, only color appearance +related parameters are optimized to transfer the style and preserve the visual +content. A novel structure consistency loss is applied to latent features and +depth levels to enhance the stability of the transferred images. Detailed +validation was performed to demonstrate the performance advantages of the +proposed method compared to that of the current state-of-the-art, highlighting +the potential for intra-operative surgical navigation. + +
+
+ comment: Accepted by ICRA 2025 +
+
+
+
+
+ + ♻ ☆ A Physical Coherence Benchmark for Evaluating Video Generation Models + via Optical Flow-guided Frame Prediction + + +
+ Recent advances in video generation models demonstrate their potential as +world simulators, but they often struggle with videos deviating from physical +laws, a key concern overlooked by most text-to-video benchmarks. We introduce a +benchmark designed specifically to assess the Physical Coherence of generated +videos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of +physical principles, capturing key physical laws observable in video content. +We evaluated four state-of-the-art (SoTA) T2V models on PhyCoBench and +conducted manual assessments. Additionally, we propose an automated evaluation +model: PhyCoPredictor, a diffusion model that generates optical flow and video +frames in a cascade manner. Through a consistency evaluation comparing +automated and manual sorting, the experimental results show that PhyCoPredictor +currently aligns most closely with human evaluation. Therefore, it can +effectively evaluate the physical coherence of videos, providing insights for +future model optimization. Our benchmark, including physical coherence prompts, +the automatic evaluation tool PhyCoPredictor, and the generated video dataset, +has been released on GitHub at https://github.com/Jeckinchen/PhyCoBench. + +
+
+
+
+
+ + ♻ ☆ Counting Guidance for High Fidelity Text-to-Image Synthesis WACV 2025 + + +
+ Recently, there have been significant improvements in the quality and +performance of text-to-image generation, largely due to the impressive results +attained by diffusion models. However, text-to-image diffusion models sometimes +struggle to create high-fidelity content for the given input prompt. One +specific issue is their difficulty in generating the precise number of objects +specified in the text prompt. For example, when provided with the prompt "five +apples and ten lemons on a table," images generated by diffusion models often +contain an incorrect number of objects. In this paper, we present a method to +improve diffusion models so that they accurately produce the correct object +count based on the input prompt. We adopt a counting network that performs +reference-less class-agnostic counting for any given image. We calculate the +gradients of the counting network and refine the predicted noise for each step. +To address the presence of multiple types of objects in the prompt, we utilize +novel attention map guidance to obtain high-quality masks for each object. +Finally, we guide the denoising process using the calculated gradients for each +object. Through extensive experiments and evaluation, we demonstrate that the +proposed method significantly enhances the fidelity of diffusion models with +respect to object count. Code is available at +https://github.com/furiosa-ai/counting-guidance. + +
+
+ comment: Accepted at WACV 2025 (Oral). Code is available at + https://github.com/furiosa-ai/counting-guidance +
+
+
+
+
+ + ♻ HunyuanVideo: A Systematic Framework For Large Video Generative Models + + +
+ Recent advancements in video generation have significantly impacted daily +life for both individuals and industries. However, the leading video generation +models remain closed-source, resulting in a notable performance gap between +industry capabilities and those available to the public. In this report, we +introduce HunyuanVideo, an innovative open-source video foundation model that +demonstrates performance in video generation comparable to, or even surpassing, +that of leading closed-source models. HunyuanVideo encompasses a comprehensive +framework that integrates several key elements, including data curation, +advanced architectural design, progressive model scaling and training, and an +efficient infrastructure tailored for large-scale model training and inference. +As a result, we successfully trained a video generative model with over 13 +billion parameters, making it the largest among all open-source models. We +conducted extensive experiments and implemented a series of targeted designs to +ensure high visual quality, motion dynamics, text-video alignment, and advanced +filming techniques. According to evaluations by professionals, HunyuanVideo +outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, +and three top-performing Chinese video generative models. By releasing the code +for the foundation model and its applications, we aim to bridge the gap between +closed-source and open-source communities. This initiative will empower +individuals within the community to experiment with their ideas, fostering a +more dynamic and vibrant video generation ecosystem. The code is publicly +available at https://github.com/Tencent/HunyuanVideo. + +
+
+
+
+
+ + ♻ ☆ SCott: Accelerating Diffusion Models with Stochastic Consistency + Distillation + + +
+ The iterative sampling procedure employed by diffusion models (DMs) often +leads to significant inference latency. To address this, we propose Stochastic +Consistency Distillation (SCott) to enable accelerated text-to-image +generation, where high-quality and diverse generations can be achieved within +just 2-4 sampling steps. In contrast to vanilla consistency distillation (CD) +which distills the ordinary differential equation solvers-based sampling +process of a pre-trained teacher model into a student, SCott explores the +possibility and validates the efficacy of integrating stochastic differential +equation (SDE) solvers into CD to fully unleash the potential of the teacher. +SCott is augmented with elaborate strategies to control the noise strength and +sampling process of the SDE solver. An adversarial loss is further incorporated +to strengthen the consistency constraints in rare sampling steps. Empirically, +on the MSCOCO-2017 5K dataset with a Stable Diffusion-V1.5 teacher, SCott +achieves an FID of 21.9 with 2 sampling steps, surpassing that of the 1-step +InstaFlow (23.4) and the 4-step UFOGen (22.1). Moreover, SCott can yield more +diverse samples than other consistency models for high-resolution image +generation, with up to 16% improvement in a qualified metric. + +
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Schedule On the Fly: Diffusion Time Prediction for Faster and Better + Image Generation + + +
+ Diffusion and flow matching models have achieved remarkable success in +text-to-image generation. However, these models typically rely on the +predetermined denoising schedules for all prompts. The multi-step reverse +diffusion process can be regarded as a kind of chain-of-thought for generating +high-quality images step by step. Therefore, diffusion models should reason for +each instance to adaptively determine the optimal noise schedule, achieving +high generation quality with sampling efficiency. In this paper, we introduce +the Time Prediction Diffusion Model (TPDM) for this. TPDM employs a +plug-and-play Time Prediction Module (TPM) that predicts the next noise level +based on current latent features at each denoising step. We train the TPM using +reinforcement learning to maximize a reward that encourages high final image +quality while penalizing excessive denoising steps. With such an adaptive +scheduler, TPDM not only generates high-quality images that are aligned closely +with human preferences but also adjusts diffusion time and the number of +denoising steps on the fly, enhancing both performance and efficiency. With +Stable Diffusion 3 Medium architecture, TPDM achieves an aesthetic score of +5.44 and a human preference score (HPS) of 29.59, while using around 50% fewer +denoising steps to achieve better performance. + +
+
+
+
+
+ + ♻ ☆ Explaining Vision-Language Similarities in Dual Encoders with + Feature-Pair Attributions + + +
+ Dual encoder architectures like CLIP models map two types of inputs into a +shared embedding space and predict similarities between them. Despite their +success, it is, however, not understood how these models compare their two +inputs. Common first-order feature-attribution methods can only provide limited +insights into dual-encoders since their predictions depend on +feature-interactions rather than on individual features. In this paper, we +first derive a second-order method enabling the attribution of predictions by +any differentiable dual encoder onto feature-interactions between its inputs. +Second, we apply our method to CLIP models and show that they learn +fine-grained correspondences between parts of captions and regions in images. +They match objects across input modes and also account for mismatches. This +visual-linguistic grounding ability, however, varies heavily between object +classes and exhibits pronounced out-of-domain effects. We can identify +individual errors as well as systematic failure categories including object +coverage, unusual scenes and correlated contexts. + +
+
+
+
+
+ + ♻ ☆ Super-Resolution on Rotationally Scanned Photoacoustic Microscopy Images + Incorporating Scanning Prior + + +
+ Photoacoustic Microscopy (PAM) images integrating the advantages of optical +contrast and acoustic resolution have been widely used in brain studies. +However, there exists a trade-off between scanning speed and image resolution. +Compared with traditional raster scanning, rotational scanning provides good +opportunities for fast PAM imaging by optimizing the scanning mechanism. +Recently, there is a trend to incorporate deep learning into the scanning +process to further increase the scanning speed. Yet, most such attempts are +performed for raster scanning while those for rotational scanning are +relatively rare. In this study, we propose a novel and well-performing +super-resolution framework for rotational scanning-based PAM imaging. To +eliminate adjacent rows' displacements due to subject motion or high-frequency +scanning distortion, we introduce a registration module across odd and even rows +in the preprocessing and incorporate displacement degradation in the training. +Besides, gradient-based patch selection is proposed to increase the probability +of blood vessel patches being selected for training. A Transformer-based +network with a global receptive field is applied for better performance. +Experimental results on both synthetic and real datasets demonstrate the +effectiveness and generalizability of our proposed framework for rotationally +scanned PAM images' super-resolution, both quantitatively and qualitatively. +Code is available at https://github.com/11710615/PAMSR.git. + +
+
+
+
+
+ + ♻ ☆ XLSTM-HVED: Cross-Modal Brain Tumor Segmentation and MRI Reconstruction + Method Using Vision XLSTM and Heteromodal Variational Encoder-Decoder + + +
+ Neurogliomas are among the most aggressive forms of cancer, presenting +considerable challenges in both treatment and monitoring due to their +unpredictable biological behavior. Magnetic resonance imaging (MRI) is +currently the preferred method for diagnosing and monitoring gliomas. However, +the lack of specific imaging techniques often compromises the accuracy of tumor +segmentation during the imaging process. To address this issue, we introduce +the XLSTM-HVED model. This model integrates a hetero-modal encoder-decoder +framework with the Vision XLSTM module to reconstruct missing MRI modalities. +By deeply fusing spatial and temporal features, it enhances tumor segmentation +performance. The key innovation of our approach is the Self-Attention +Variational Encoder (SAVE) module, which improves the integration of modal +features. Additionally, it optimizes the interaction of features between +segmentation and reconstruction tasks through the Squeeze-Fusion-Excitation +Cross Awareness (SFECA) module. Our experiments using the BraTS 2024 dataset +demonstrate that our model significantly outperforms existing advanced methods +in handling cases where modalities are missing. Our source code is available at +https://github.com/Quanato607/XLSTM-HVED. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ 3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a cutting-edge technique for +real-time radiance field rendering, offering state-of-the-art performance in +terms of both quality and speed. 3DGS models a scene as a collection of +three-dimensional Gaussians, with additional attributes optimized to conform to +the scene's geometric and visual properties. Despite its advantages in +rendering speed and image fidelity, 3DGS is limited by its significant storage +and memory demands. These high demands make 3DGS impractical for mobile devices +or headsets, reducing its applicability in important areas of computer +graphics. To address these challenges and advance the practicality of 3DGS, +this survey provides a comprehensive and detailed examination of compression +and compaction techniques developed to make 3DGS more efficient. We classify +existing methods into two categories: compression, which focuses on reducing +file size, and compaction, which aims to minimize the number of Gaussians. Both +methods aim to maintain or improve quality, each by minimizing its respective +attribute: file size for compression and Gaussian count for compaction. We +introduce the basic mathematical concepts underlying the analyzed methods, as +well as key implementation details and design choices. Our report thoroughly +discusses similarities and differences among the methods, as well as their +respective advantages and disadvantages. We establish a consistent framework +for comparing the surveyed methods based on key performance metrics and +datasets. Specifically, since these methods have been developed in parallel and +over a short period of time, currently, no comprehensive comparison exists. +This survey, for the first time, presents a unified framework to evaluate 3DGS +compression techniques. We maintain a website that will be regularly updated +with emerging methods: https://w-m.github.io/3dgs-compression-survey/ . + +
+
+ comment: 3D Gaussian Splatting compression survey; 3DGS compression; updated + discussion; new approaches added; new illustrations +
+
+
+
+
+ + ♻ SLTNet: Efficient Event-based Semantic Segmentation with Spike-driven + Lightweight Transformer-based Networks IROS 2025 + + +
+ Event-based semantic segmentation has great potential in autonomous driving +and robotics due to the advantages of event cameras, such as high dynamic +range, low latency, and low power cost. Unfortunately, current artificial +neural network (ANN)-based segmentation methods suffer from high computational +demands, the requirements for image frames, and massive energy consumption, +limiting their efficiency and application on resource-constrained edge/mobile +platforms. To address these problems, we introduce SLTNet, a spike-driven +lightweight transformer-based network designed for event-based semantic +segmentation. Specifically, SLTNet is built on efficient spike-driven +convolution blocks (SCBs) to extract rich semantic features while reducing the +model's parameters. Then, to enhance the long-range contextual feature +interaction, we propose novel spike-driven transformer blocks (STBs) with +binary mask operations. Based on these basic blocks, SLTNet employs a +high-efficiency single-branch architecture while maintaining the low energy +consumption of the Spiking Neural Network (SNN). Finally, extensive experiments +on DDD17 and DSEC-Semantic datasets demonstrate that SLTNet outperforms +state-of-the-art (SOTA) SNN-based methods by at most 9.06% and 9.39% mIoU, +respectively, with 4.58x lower energy consumption and 114 FPS +inference speed. Our code is open-sourced and available at +https://github.com/longxianlei/SLTNet-v1.0. + +
+
+ comment: Submitted to 2025 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2025) +
+
+
+
+
+ + ♻ ☆ ChemVLM: Exploring the Power of Multimodal Large Language Models in + Chemistry Area + + +
+ Large Language Models (LLMs) have achieved remarkable success and have been +applied across various scientific fields, including chemistry. However, many +chemical tasks require the processing of visual information, which cannot be +successfully handled by existing chemical LLMs. This brings a growing need for +models capable of integrating multimodal information in the chemical domain. In +this paper, we introduce ChemVLM, an open-source chemical multimodal +large language model specifically designed for chemical applications. ChemVLM +is trained on a carefully curated bilingual multimodal dataset that enhances +its ability to understand both textual and visual chemical information, +including molecular structures, reactions, and chemistry examination questions. +We develop three datasets for comprehensive evaluation, tailored to Chemical +Optical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and +Multimodal Molecule Understanding tasks. We benchmark ChemVLM against a range +of open-source and proprietary multimodal large language models on various +tasks. Experimental results demonstrate that ChemVLM achieves competitive +performance across all evaluated tasks. Our model can be found at +https://huggingface.co/AI4Chem/ChemVLM-26B. + +
+
+ comment: 11 pages, updated version +
+
+
+
+
+ + ♻ ☆ Scale-Invariant Object Detection by Adaptive Convolution with Unified + Global-Local Context + + +
+ Dense features are important for detecting minute objects in images. +Unfortunately, despite the remarkable efficacy of the CNN models in multi-scale +object detection, CNN models often fail to detect smaller objects in images due +to the loss of dense features during the pooling process. Atrous convolution +addresses this issue by applying sparse kernels. However, sparse kernels often +can lose the multi-scale detection efficacy of the CNN model. In this paper, we +propose an object detection model using a Switchable (adaptive) Atrous +Convolutional Network (SAC-Net) based on the efficientDet model. A fixed atrous +rate limits the performance of the CNN models in the convolutional layers. To +overcome this limitation, we introduce a switchable mechanism that allows for +dynamically adjusting the atrous rate during the forward pass. The proposed +SAC-Net encapsulates the benefits of both low-level and high-level features to +achieve improved performance on multi-scale object detection tasks, without +losing the dense features. Further, we apply a depth-wise switchable atrous +rate to the proposed network, to improve the scale-invariant features. Finally, +we apply global context on the proposed model. Our extensive experiments on +benchmark datasets demonstrate that the proposed SAC-Net outperforms the +state-of-the-art models by a significant margin in terms of accuracy. + +
+
+
+
+
+ + ♻ ☆ Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class + Feature Compensator ICLR 2025 + + +
+ Dataset distillation has emerged as a technique aiming to condense +informative features from large, natural datasets into a compact and synthetic +form. While recent advancements have refined this technique, its performance is +bottlenecked by the prevailing class-specific synthesis paradigm. Under this +paradigm, synthetic data is optimized exclusively for a pre-assigned one-hot +label, creating an implicit class barrier in feature condensation. This leads +to inefficient utilization of the distillation budget and oversight of +inter-class feature distributions, which ultimately limits the effectiveness +and efficiency, as demonstrated in our analysis. To overcome these constraints, +this paper presents the Inter-class Feature Compensator (INFER), an innovative +distillation approach that transcends the class-specific data-label framework +widely utilized in current dataset distillation methods. Specifically, INFER +leverages a Universal Feature Compensator (UFC) to enhance feature integration +across classes, enabling the generation of multiple additional synthetic +instances from a single UFC input. This significantly improves the efficiency +of the distillation budget. Moreover, INFER enriches inter-class interactions +during the distillation, thereby enhancing the effectiveness and +generalizability of the distilled data. By allowing for the linear +interpolation of labels similar to those in the original dataset, INFER +meticulously optimizes the synthetic data and dramatically reduces the size of +soft labels in the synthetic dataset to almost zero, establishing a new +benchmark for efficiency and effectiveness in dataset distillation. In +practice, INFER demonstrates state-of-the-art performance across benchmark +datasets. For instance, in the ipc = 50 setting on ImageNet-1k with the same +compression level, it outperforms SRe2L by 34.5% using ResNet18. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Floorplan-SLAM: A Real-Time, High-Accuracy, and Long-Term Multi-Session + Point-Plane SLAM for Efficient Floorplan Reconstruction + + +
+ Floorplan reconstruction provides structural priors essential for reliable +indoor robot navigation and high-level scene understanding. However, existing +approaches either require time-consuming offline processing with a complete +map, or rely on expensive sensors and substantial computational resources. To +address the problems, we propose Floorplan-SLAM, which incorporates floorplan +reconstruction tightly into a multi-session SLAM system by seamlessly +interacting with plane extraction, pose estimation, and back-end optimization, +achieving real-time, high-accuracy, and long-term floorplan reconstruction +using only a stereo camera. Specifically, we present a robust plane extraction +algorithm that operates in a compact plane parameter space and leverages +spatially complementary features to accurately detect planar structures, even +in weakly textured scenes. Furthermore, we propose a floorplan reconstruction +module tightly coupled with the SLAM system, which uses continuously optimized +plane landmarks and poses to formulate and solve a novel optimization problem, +thereby enabling real-time incremental floorplan reconstruction. Note that by +leveraging the map merging capability of multi-session SLAM, our method +supports long-term floorplan reconstruction across multiple sessions without +redundant data collection. Experiments on the VECtor and the self-collected +datasets indicate that Floorplan-SLAM significantly outperforms +state-of-the-art methods in terms of plane extraction robustness, pose +estimation accuracy, and floorplan reconstruction fidelity and speed, achieving +real-time performance at 25-45 FPS without GPU acceleration, which reduces the +floorplan reconstruction time for a 1000 square meters scene from over 10 hours +to just 9.44 minutes. + +
+
+
+
+
+ + ♻ ☆ Look, Listen, and Answer: Overcoming Biases for Audio-Visual Question + Answering NeurIPS 2024 + + +
+ Audio-Visual Question Answering (AVQA) is a complex multi-modal reasoning +task, demanding intelligent systems to accurately respond to natural language +queries based on audio-video input pairs. Nevertheless, prevalent AVQA +approaches are prone to overlearning dataset biases, resulting in poor +robustness. Furthermore, current datasets may not provide a precise diagnostic +for these methods. To tackle these challenges, firstly, we propose a novel +dataset, MUSIC-AVQA-R, crafted in two steps: rephrasing questions within the +test split of a public dataset (MUSIC-AVQA) and subsequently introducing +distribution shifts to split questions. The former leads to a large, diverse +test space, while the latter results in a comprehensive robustness evaluation +on rare, frequent, and overall questions. Secondly, we propose a robust +architecture that utilizes a multifaceted cycle collaborative debiasing +strategy to overcome bias learning. Experimental results show that this +architecture achieves state-of-the-art performance on MUSIC-AVQA-R, notably +obtaining a significant improvement of 9.32%. Extensive ablation experiments +are conducted on the two datasets mentioned to analyze the component +effectiveness within the debiasing strategy. Additionally, we highlight the +limited robustness of existing multi-modal QA methods through the evaluation on +our dataset. We also conduct experiments combining various baselines with our +proposed strategy on two datasets to verify its plug-and-play capability. Our +dataset and code are available at https://github.com/reml-group/MUSIC-AVQA-R. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Q-Eval-100K: Evaluating Visual Quality and Alignment Level for + Text-to-Vision Content CVPR 2025 + + +
+ Evaluating text-to-vision content hinges on two crucial aspects: visual +quality and alignment. While significant progress has been made in developing +objective models to assess these dimensions, the performance of such models +heavily relies on the scale and quality of human annotations. According to +Scaling Law, increasing the number of human-labeled instances follows a +predictable pattern that enhances the performance of evaluation models. +Therefore, we introduce a comprehensive dataset designed to Evaluate Visual +quality and Alignment Level for text-to-vision content (Q-EVAL-100K), featuring +the largest collection of human-labeled Mean Opinion Scores (MOS) for the +mentioned two aspects. The Q-EVAL-100K dataset encompasses both text-to-image +and text-to-video models, with 960K human annotations specifically focused on +visual quality and alignment for 100K instances (60K images and 40K videos). +Leveraging this dataset with context prompt, we propose Q-Eval-Score, a unified +model capable of evaluating both visual quality and alignment with special +improvements for handling long-text prompt alignment. Experimental results +indicate that the proposed Q-Eval-Score achieves superior performance on both +visual quality and alignment, with strong generalization capabilities across +other benchmarks. These findings highlight the significant value of the +Q-EVAL-100K dataset. Data and codes will be available at +https://github.com/zzc-1998/Q-Eval. + +
+
+ comment: Accepted to CVPR 2025 +
+
+
+
+
+ + ♻ ☆ MagicDrive-V2: High-Resolution Long Video Generation for Autonomous + Driving with Adaptive Control + + +
+ The rapid advancement of diffusion models has greatly improved video +synthesis, especially in controllable video generation, which is vital for +applications like autonomous driving. Although DiT with 3D VAE has become a +standard framework for video generation, it introduces challenges in +controllable driving video generation, especially for geometry control, +rendering existing control methods ineffective. To address these issues, we +propose MagicDrive-V2, a novel approach that integrates the MVDiT block and +spatial-temporal conditional encoding to enable multi-view video generation and +precise geometric control. Additionally, we introduce an efficient method for +obtaining contextual descriptions for videos to support diverse textual +control, along with a progressive training strategy using mixed video data to +enhance training efficiency and generalizability. Consequently, MagicDrive-V2 +enables multi-view driving video synthesis with $3.3\times$ resolution and +$4\times$ frame count (compared to current SOTA), rich contextual control, and +geometric controls. Extensive experiments demonstrate MagicDrive-V2's ability, +unlocking broader applications in autonomous driving. + +
+
+ comment: Project Website: https://flymin.github.io/magicdrive-v2/ +
+
+
+
+
+ + ♻ ☆ TimeRefine: Temporal Grounding with Time Refining Video LLM + + +
+ Video temporal grounding aims to localize relevant temporal boundaries in a +video given a textual prompt. Recent work has focused on enabling Video LLMs to +perform video temporal grounding via next-token prediction of temporal +timestamps. However, accurately localizing timestamps in videos remains +challenging for Video LLMs when relying solely on temporal token prediction. +Our proposed TimeRefine addresses this challenge in two ways. First, instead of +directly predicting the start and end timestamps, we reformulate the temporal +grounding task as a temporal refining task: the model first makes rough +predictions and then refines them by predicting offsets to the target segment. +This refining process is repeated multiple times, through which the model +progressively self-improves its temporal localization accuracy. Second, to +enhance the model's temporal perception capabilities, we incorporate an +auxiliary prediction head that penalizes the model more if a predicted segment +deviates further from the ground truth, thus encouraging the model to make +closer and more accurate predictions. Our plug-and-play method can be +integrated into most LLM-based temporal grounding approaches. The experimental +results demonstrate that TimeRefine achieves 3.6% and 5.0% mIoU improvements on +the ActivityNet and Charades-STA datasets, respectively. Code and pretrained +models will be released. + +
+
+
+
+
+ + ♻ ☆ CarPlanner: Consistent Auto-regressive Trajectory Planning for + Large-scale Reinforcement Learning in Autonomous Driving CVPR 2025 + + +
+ Trajectory planning is vital for autonomous driving, ensuring safe and +efficient navigation in complex environments. While recent learning-based +methods, particularly reinforcement learning (RL), have shown promise in +specific scenarios, RL planners struggle with training inefficiencies and +managing large-scale, real-world driving scenarios. In this paper, we introduce +CarPlanner, a Consistent auto-regressive +Planner that uses RL to generate multi-modal trajectories. The +auto-regressive structure enables efficient large-scale RL training, while the +incorporation of consistency ensures stable policy learning by maintaining +coherent temporal consistency across time steps. Moreover, CarPlanner employs a +generation-selection framework with an expert-guided reward function and an +invariant-view module, simplifying RL training and enhancing policy +performance. Extensive analysis demonstrates that our proposed RL framework +effectively addresses the challenges of training efficiency and performance +enhancement, positioning CarPlanner as a promising solution for trajectory +planning in autonomous driving. To the best of our knowledge, we are the first +to demonstrate that the RL-based planner can surpass both IL- and rule-based +state-of-the-arts (SOTAs) on the challenging large-scale real-world dataset +nuPlan. Our proposed CarPlanner surpasses RL-, IL-, and rule-based SOTA +approaches within this demanding dataset. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Deep Learning-based MRI Reconstruction with Artificial Fourier Transform + Network (AFTNet) + + +
+ Deep complex-valued neural networks (CVNNs) provide a powerful way to +leverage complex number operations and representations and have succeeded in +several phase-based applications. However, previous networks have not fully +explored the impact of complex-valued networks in the frequency domain. Here, +we introduce a unified complex-valued deep learning framework-Artificial +Fourier Transform Network (AFTNet)-which combines domain-manifold learning and +CVNNs. AFTNet can be readily used to solve image inverse problems in domain +transformation, especially for accelerated magnetic resonance imaging (MRI) +reconstruction and other applications. While conventional methods typically +utilize magnitude images or treat the real and imaginary components of k-space +data as separate channels, our approach directly processes raw k-space data in +the frequency domain, utilizing complex-valued operations. This allows for a +mapping between the frequency (k-space) and image domain to be determined +through cross-domain learning. We show that AFTNet achieves superior +accelerated MRI reconstruction compared to existing approaches. Furthermore, +our approach can be applied to various tasks, such as denoised magnetic +resonance spectroscopy (MRS) reconstruction and datasets with various +contrasts. The AFTNet presented here is a valuable preprocessing component for +different preclinical studies and provides an innovative alternative for +solving inverse problems in imaging and spectroscopy. The code is available at: +https://github.com/yanting-yang/AFT-Net. + +
+
+
+
+
+ + ♻ ☆ RoboSense: Large-scale Dataset and Benchmark for Egocentric Robot + Perception and Navigation in Crowded and Unstructured Environments CVPR2025 + + +
+ Reliable embodied perception from an egocentric perspective is challenging +yet essential for autonomous navigation technology of intelligent mobile +agents. With the growing demand of social robotics, near-field scene +understanding becomes an important research topic in the areas of egocentric +perceptual tasks related to navigation in both crowded and unstructured +environments. Due to the complexity of environmental conditions and difficulty +of surrounding obstacles owing to truncation and occlusion, the perception +capability under this circumstance is still inferior. To further enhance the +intelligence of mobile robots, in this paper, we set up an egocentric +multi-sensor data collection platform based on 3 main types of sensors (Camera, +LiDAR and Fisheye), which supports flexible sensor configurations to enable +dynamic sight of view from ego-perspective, capturing either near or farther +areas. Meanwhile, a large-scale multimodal dataset is constructed, named +RoboSense, to facilitate egocentric robot perception. Specifically, RoboSense +contains more than 133K synchronized data with 1.4M 3D bounding boxes and IDs +annotated in the full $360^{\circ}$ view, forming 216K trajectories across 7.6K +temporal sequences. It has $270\times$ and $18\times$ as many annotations of +surrounding obstacles within near ranges as the previous datasets collected for +autonomous driving scenarios such as KITTI and nuScenes. Moreover, we define a +novel matching criterion for near-field 3D perception and prediction metrics. +Based on RoboSense, we formulate 6 popular tasks to facilitate the future +research development, where the detailed analysis as well as benchmarks are +also provided accordingly. Data desensitization measures have been conducted +for privacy protection. + +
+
+ comment: Accepted to CVPR2025 +
+
+
+
+
+ + ♻ ☆ ArcPro: Architectural Programs for Structured 3D Abstraction of Sparse + Points CVPR 2025 + + +
+ We introduce ArcPro, a novel learning framework built on architectural +programs to recover structured 3D abstractions from highly sparse and +low-quality point clouds. Specifically, we design a domain-specific language +(DSL) to hierarchically represent building structures as a program, which can +be efficiently converted into a mesh. We bridge feedforward and inverse +procedural modeling by using a feedforward process for training data synthesis, +allowing the network to make reverse predictions. We train an encoder-decoder +on the points-program pairs to establish a mapping from unstructured point +clouds to architectural programs, where a 3D convolutional encoder extracts +point cloud features and a transformer decoder autoregressively predicts the +programs in a tokenized form. Inference by our method is highly efficient and +produces plausible and faithful 3D abstractions. Comprehensive experiments +demonstrate that ArcPro outperforms both traditional architectural proxy +reconstruction and learning-based abstraction methods. We further explore its +potential to work with multi-view image and natural language inputs. + +
+
+ comment: CVPR 2025 (Patent Protected); Project page: + https://vcc.tech/research/2025/ArcPro +
+
+
+
+
+ + ♻ ☆ Dynamic Sparse Training versus Dense Training: The Unexpected Winner in + Image Corruption Robustness ICLR 2025 + + +
+ It is generally perceived that Dynamic Sparse Training opens the door to a +new era of scalability and efficiency for artificial neural networks at, +perhaps, some costs in accuracy performance for the classification task. At the +same time, Dense Training is widely accepted as being the "de facto" approach +to train artificial neural networks if one would like to maximize their +robustness against image corruption. In this paper, we question this general +practice. Consequently, we claim that, contrary to what is commonly thought, +the Dynamic Sparse Training methods can consistently outperform Dense Training +in terms of robustness accuracy, particularly if the efficiency aspect is not +considered as a main objective (i.e., sparsity levels between 10% and up to +50%), without adding (or even reducing) resource cost. We validate our claim on +two types of data, images and videos, using several traditional and modern deep +learning architectures for computer vision and three widely studied Dynamic +Sparse Training algorithms. Our findings reveal a new yet-unknown benefit of +Dynamic Sparse Training and open new possibilities in improving deep learning +robustness beyond the current state of the art. + +
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ VL-Nav: Real-time Vision-Language Navigation with Spatial Reasoning + + +
+ Vision-language navigation in unknown environments is crucial for mobile +robots. In scenarios such as household assistance and rescue, mobile robots +need to understand a human command, such as "find a person wearing black". We +present a novel vision-language navigation (VL-Nav) system that integrates +efficient spatial reasoning on low-power robots. Unlike prior methods that rely +on a single image-level feature similarity to guide a robot, our method +integrates pixel-wise vision-language features with curiosity-driven +exploration. This approach enables robust navigation to human-instructed +instances across diverse environments. We deploy VL-Nav on a four-wheel mobile +robot and evaluate its performance through comprehensive navigation tasks in +both indoor and outdoor environments, spanning different scales and semantic +complexities. Remarkably, VL-Nav operates at a real-time frequency of 30 Hz +with a Jetson Orin NX, highlighting its ability to conduct efficient +vision-language navigation. Results show that VL-Nav achieves an overall +success rate of 86.3%, outperforming previous methods by 44.15%. + +
+
+
+
+
+ + ♻ ☆ Near-infrared Image Deblurring and Event Denoising with Synergistic + Neuromorphic Imaging + + +
+ The fields of imaging in the nighttime dynamic and other extremely dark +conditions have seen impressive and transformative advancements in recent +years, partly driven by the rise of novel sensing approaches, e.g., +near-infrared (NIR) cameras with high sensitivity and event cameras with +minimal blur. However, inappropriate exposure ratios of near-infrared cameras +make them susceptible to distortion and blur. Event cameras are also highly +sensitive to weak signals at night yet prone to interference, often generating +substantial noise and significantly degrading observations and analysis. +Herein, we develop a new framework for low-light imaging combined with NIR +imaging and event-based techniques, named synergistic neuromorphic imaging, +which can jointly achieve NIR image deblurring and event denoising. Harnessing +cross-modal features of NIR images and visible events via spectral consistency +and higher-order interaction, the NIR images and events are simultaneously +fused, enhanced, and bootstrapped. Experiments on real and realistically +simulated sequences demonstrate the effectiveness of our method and indicate +better accuracy and robustness than other methods in practical scenarios. This +study gives impetus to enhance both NIR images and events, which paves the way +for high-fidelity low-light imaging and neuromorphic reasoning. + +
+
+
+
+
+ + ♻ ☆ STAA-SNN: Spatial-Temporal Attention Aggregator for Spiking Neural + Networks CVPR 2025 + + +
+ Spiking Neural Networks (SNNs) have gained significant attention due to their +biological plausibility and energy efficiency, making them promising +alternatives to Artificial Neural Networks (ANNs). However, the performance gap +between SNNs and ANNs remains a substantial challenge hindering the widespread +adoption of SNNs. In this paper, we propose a Spatial-Temporal Attention +Aggregator SNN (STAA-SNN) framework, which dynamically focuses on and captures +both spatial and temporal dependencies. First, we introduce a spike-driven +self-attention mechanism specifically designed for SNNs. Additionally, we +pioneeringly incorporate position encoding to integrate latent temporal +relationships into the incoming features. For spatial-temporal information +aggregation, we employ step attention to selectively amplify relevant features +at different steps. Finally, we implement a time-step random dropout strategy +to avoid local optima. As a result, STAA-SNN effectively captures both spatial +and temporal dependencies, enabling the model to analyze complex patterns and +make accurate predictions. The framework demonstrates exceptional performance +across diverse datasets and exhibits strong generalization capabilities. +Notably, STAA-SNN achieves state-of-the-art results on neuromorphic datasets +CIFAR10-DVS, with remarkable performances of 97.14%, 82.05% and 70.40% on the +static datasets CIFAR-10, CIFAR-100 and ImageNet, respectively. Furthermore, +our model exhibits improved performance ranging from 0.33\% to 2.80\% with +fewer time steps. The code for the model is available on GitHub. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Solving the Catastrophic Forgetting Problem in Generalized Category + Discovery CVPR 2024 + + +
+ Generalized Category Discovery (GCD) aims to identify a mix of known and +novel categories within unlabeled data sets, providing a more realistic setting +for image recognition. Essentially, GCD needs to remember existing patterns +thoroughly to recognize novel categories. Recent state-of-the-art method SimGCD +transfers the knowledge from known-class data to the learning of novel classes +through debiased learning. However, some patterns are catastrophically forgotten +during adaptation and thus lead to poor performance in novel categories +classification. To address this issue, we propose a novel learning approach, +LegoGCD, which is seamlessly integrated into previous methods to enhance the +discrimination of novel classes while maintaining performance on previously +encountered known classes. Specifically, we design two types of techniques +termed as Local Entropy Regularization (LER) and Dual-views Kullback Leibler +divergence constraint (DKL). The LER optimizes the distribution of potential +known class samples in unlabeled data, thus ensuring the preservation of +knowledge related to known categories while learning novel classes. Meanwhile, +DKL introduces Kullback Leibler divergence to encourage the model to produce a +similar prediction distribution of two view samples from the same image. In +this way, it successfully avoids mismatched prediction and generates more +reliable potential known class samples simultaneously. Extensive experiments +validate that the proposed LegoGCD effectively addresses the known category +forgetting issue across all datasets, e.g., delivering a 7.74% and 2.51% accuracy +boost on known and novel classes in CUB, respectively. Our code is available +at: https://github.com/Cliffia123/LegoGCD. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Multi-Sensor Fusion Approach for Rapid Orthoimage Generation in + Large-Scale UAV Mapping + + +
+ Rapid generation of large-scale orthoimages from Unmanned Aerial Vehicles +(UAVs) has been a long-standing focus of research in the field of aerial +mapping. A multi-sensor UAV system, integrating the Global Positioning System +(GPS), Inertial Measurement Unit (IMU), 4D millimeter-wave radar and camera, +can provide an effective solution to this problem. In this paper, we utilize +multi-sensor data to overcome the limitations of conventional orthoimage +generation methods in terms of temporal performance, system robustness, and +geographic reference accuracy. A prior-pose-optimized feature matching method +is introduced to enhance matching speed and accuracy, reducing the number of +required features and providing precise references for the Structure from +Motion (SfM) process. The proposed method exhibits robustness in low-texture +scenes like farmlands, where feature matching is difficult. Experiments show +that our approach achieves accurate feature matching orthoimage generation in a +short time. The proposed drone system effectively aids in farmland detection +and management. + +
+
+
+
+
+ + ♻ ☆ KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models + + +
+ This paper investigates visual analogical reasoning in large multimodal +models (LMMs) compared to human adults and children. A "visual analogy" is an +abstract rule inferred from one image and applied to another. While benchmarks +exist for testing visual reasoning in LMMs, they require advanced skills and +omit basic visual analogies that even young children can make. Inspired by +developmental psychology, we propose a new benchmark of 4,300 visual +transformations of everyday objects to test LMMs on visual analogical reasoning +and compare them to children (ages three to five) and to adults. We structure +the evaluation into three stages: identifying what changed (e.g., color, +number, etc.), how it changed (e.g., added one object), and applying the rule +to new scenarios. Our findings show that while GPT-o1, GPT-4V, LLaVA-1.5, and +MANTIS identify the "what" effectively, they struggle with quantifying the +"how" and extrapolating this rule to new objects. In contrast, children and +adults exhibit much stronger analogical reasoning at all three stages. +Additionally, the strongest tested model, GPT-o1, performs better in tasks +involving simple surface-level visual attributes like color and size, +correlating with quicker human adult response times. Conversely, more complex +tasks such as number, rotation, and reflection, which necessitate extensive +cognitive processing and understanding of extrinsic spatial properties in the +physical world, present more significant challenges. Altogether, these findings +highlight the limitations of training models on data that primarily consists of +2D images and text. + +
+
+ comment: 10 pages. Project website: https://ey242.github.io/kiva.github.io/. + Benchmark and code: https://github.com/ey242/KiVA +
+
+
+
+
+ + ♻ LiFT: Leveraging Human Feedback for Text-to-Video Model Alignment + + +
+ Recent advances in text-to-video (T2V) generative models have shown +impressive capabilities. However, these models are still inadequate in aligning +synthesized videos with human preferences (e.g., accurately reflecting text +descriptions), which is particularly difficult to address, as human preferences +are subjective and challenging to formalize as objective functions. Existing +studies train video quality assessment models that rely on human-annotated +ratings for video evaluation but overlook the reasoning behind evaluations, +limiting their ability to capture nuanced human criteria. Moreover, aligning +T2V model using video-based human feedback remains unexplored. Therefore, this +paper proposes LiFT, the first method designed to leverage human feedback for +T2V model alignment. Specifically, we first construct a Human Rating Annotation +dataset, LiFT-HRA, consisting of approximately 10k human annotations, each +including a score and its corresponding rationale. Based on this, we train a +reward model LiFT-Critic to learn reward function effectively, which serves as +a proxy for human judgment, measuring the alignment between given videos and +human expectations. Lastly, we leverage the learned reward function to align +the T2V model by maximizing the reward-weighted likelihood. As a case study, we +apply our pipeline to CogVideoX-2B, showing that the fine-tuned model +outperforms the CogVideoX-5B across all 16 metrics, highlighting the potential +of human feedback in improving the alignment and quality of synthesized videos. + +
+
+ comment: Project page: https://codegoat24.github.io/LiFT +
+
+
+
+
+ + ♻ ☆ LCV2I: Communication-Efficient and High-Performance Collaborative + Perception Framework with Low-Resolution LiDAR + + +
+ Vehicle-to-Infrastructure (V2I) collaborative perception leverages data +collected by infrastructure's sensors to enhance vehicle perceptual +capabilities. LiDAR, as a commonly used sensor in cooperative perception, is +widely equipped in intelligent vehicles and infrastructure. However, its +superior performance comes with a correspondingly high cost. To achieve +low-cost V2I, reducing the cost of LiDAR is crucial. Therefore, we study +adopting low-resolution LiDAR on the vehicle to minimize cost as much as +possible. However, simply reducing the resolution of vehicle's LiDAR results in +sparse point clouds, making distant small objects even more blurred. +Additionally, traditional communication methods have relatively low bandwidth +utilization efficiency. These factors pose challenges for us. To balance cost +and perceptual accuracy, we propose a new collaborative perception framework, +namely LCV2I. LCV2I uses data collected from cameras and low-resolution LiDAR +as input. It also employs feature offset correction modules and regional +feature enhancement algorithms to improve feature representation. Finally, we +use regional difference map and regional score map to assess the value of +collaboration content, thereby improving communication bandwidth efficiency. In +summary, our approach achieves high perceptual performance while substantially +reducing the demand for high-resolution sensors on the vehicle. To evaluate +this algorithm, we conduct 3D object detection in the real-world scenario of +DAIR-V2X, demonstrating that the performance of LCV2I consistently surpasses +currently existing algorithms. + +
+
+
+
+
+ + ♻ ☆ Detecting Adversarial Data using Perturbation Forgery CVPR 2025 + + +
+ As a defense strategy against adversarial attacks, adversarial detection aims +to identify and filter out adversarial data from the data flow based on +discrepancies in distribution and noise patterns between natural and +adversarial data. Although previous detection methods achieve high performance +in detecting gradient-based adversarial attacks, new attacks based on +generative models with imbalanced and anisotropic noise patterns evade +detection. Even worse, the significant inference time overhead and limited +performance against unseen attacks make existing techniques impractical for +real-world use. In this paper, we explore the proximity relationship among +adversarial noise distributions and demonstrate the existence of an open +covering for these distributions. By training on the open covering of +adversarial noise distributions, a detector with strong generalization +performance against various types of unseen attacks can be developed. Based on +this insight, we heuristically propose Perturbation Forgery, which includes +noise distribution perturbation, sparse mask generation, and pseudo-adversarial +data production, to train an adversarial detector capable of detecting any +unseen gradient-based, generative-based, and physical adversarial attacks. +Comprehensive experiments conducted on multiple general and facial datasets, +with a wide spectrum of attacks, validate the strong generalization of our +method. + +
+
+ comment: Accepted as a conference paper at CVPR 2025 +
+
+
+
+
+ + ♻ ☆ CMMLoc: Advancing Text-to-PointCloud Localization with + Cauchy-Mixture-Model Based Framework CVPR 2025 + + +
+ The goal of point cloud localization based on linguistic description is to +identify a 3D position using textual description in large urban environments, +which has potential applications in various fields, such as determining the +location for vehicle pickup or goods delivery. Ideally, for a textual +description and its corresponding 3D location, the objects around the 3D +location should be fully described in the text description. However, in +practical scenarios, e.g., vehicle pickup, passengers usually describe only the +part of the most significant and nearby surroundings instead of the entire +environment. In response to this $\textbf{partially relevant}$ challenge, we +propose $\textbf{CMMLoc}$, an uncertainty-aware +$\textbf{C}$auchy-$\textbf{M}$ixture-$\textbf{M}$odel ($\textbf{CMM}$) based +framework for text-to-point-cloud $\textbf{Loc}$alization. To model the +uncertain semantic relations between text and point cloud, we integrate CMM +constraints as a prior during the interaction between the two modalities. We +further design a spatial consolidation scheme to enable adaptive aggregation of +different 3D objects with varying receptive fields. To achieve precise +localization, we propose a cardinal direction integration module alongside a +modality pre-alignment strategy, helping capture the spatial relationships +among objects and bringing the 3D objects closer to the text modality. +Comprehensive experiments validate that CMMLoc outperforms existing methods, +achieving state-of-the-art results on the KITTI360Pose dataset. Codes are +available in this GitHub repository https://github.com/kevin301342/CMMLoc. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Uni-Renderer: Unifying Rendering and Inverse Rendering Via Dual Stream + Diffusion + + +
+ Rendering and inverse rendering are pivotal tasks in both computer vision and +graphics. The rendering equation is the core of the two tasks, as an ideal +conditional distribution transfer function from intrinsic properties to RGB +images. Despite achieving promising results of existing rendering methods, they +merely approximate the ideal estimation for a specific scene and come with a +high computational cost. Additionally, the inverse conditional distribution +transfer is intractable due to the inherent ambiguity. To address these +challenges, we propose a data-driven method that jointly models rendering and +inverse rendering as two conditional generation tasks within a single diffusion +framework. Inspired by UniDiffuser, we utilize two distinct time schedules to +model both tasks, and with a tailored dual streaming module, we achieve +cross-conditioning of two pre-trained diffusion models. This unified approach, +named Uni-Renderer, allows the two processes to facilitate each other through a +cycle-consistent constraint, mitigating ambiguity by enforcing consistency +between intrinsic properties and rendered images. Combined with a meticulously +prepared dataset, our method effectively decomposes intrinsic properties +and demonstrates a strong capability to recognize changes during rendering. We +will open-source our training and inference code to the public, fostering +further research and development in this area. + +
+
+
+
+
+ + ♻ ☆ Category-level Meta-learned NeRF Priors for Efficient Object Mapping + + +
+ In 3D object mapping, category-level priors enable efficient object +reconstruction and canonical pose estimation, requiring only a single prior per +semantic category (e.g., chair, book, laptop). Recently, DeepSDF has +predominantly been used as a category-level shape prior, but it struggles to +reconstruct sharp geometry and is computationally expensive. In contrast, NeRFs +capture fine details but have yet to be effectively integrated with +category-level priors in a real-time multi-object mapping framework. To bridge +this gap, we introduce PRENOM, a Prior-based Efficient Neural Object Mapper +that integrates category-level priors with object-level NeRFs to enhance +reconstruction efficiency while enabling canonical object pose estimation. +PRENOM gets to know objects on a first-name basis by meta-learning on synthetic +reconstruction tasks generated from open-source shape datasets. To account for +object category variations, it employs a multi-objective genetic algorithm to +optimize the NeRF architecture for each category, balancing reconstruction +quality and training time. Additionally, prior-based probabilistic ray sampling +directs sampling toward expected object regions, accelerating convergence and +improving reconstruction quality under constrained resources. Experimental +results on a low-end GPU highlight the ability of PRENOM to achieve +high-quality reconstructions while maintaining computational feasibility. +Specifically, comparisons with prior-free NeRF-based approaches on a synthetic +dataset show a 21% lower Chamfer distance, demonstrating better reconstruction +quality. Furthermore, evaluations against other approaches using shape priors +on a noisy real-world dataset indicate a 13% improvement averaged across all +reconstruction metrics, and comparable pose and size estimation accuracy, while +being trained for 5x less time. + +
+
+
+
+
+ + ♻ ☆ Adapting Pre-Trained Vision Models for Novel Instance Detection and + Segmentation DSN + + +
+ Novel Instance Detection and Segmentation (NIDS) aims at detecting and +segmenting novel object instances given a few examples of each instance. We +propose a unified, simple, yet effective framework (NIDS-Net) comprising object +proposal generation, embedding creation for both instance templates and +proposal regions, and embedding matching for instance label assignment. +Leveraging recent advancements in large vision methods, we utilize Grounding +DINO and Segment Anything Model (SAM) to obtain object proposals with accurate +bounding boxes and masks. Central to our approach is the generation of +high-quality instance embeddings. We utilized foreground feature averages of +patch embeddings from the DINOv2 ViT backbone, followed by refinement through a +weight adapter mechanism that we introduce. + We show experimentally that our weight adapter can adjust the embeddings +locally within their feature space and effectively limit overfitting in the +few-shot setting. Furthermore, the weight adapter optimizes weights to enhance +the distinctiveness of instance embeddings during similarity computation. This +methodology enables a straightforward matching strategy that results in +significant performance gains. Our framework surpasses current state-of-the-art +methods, demonstrating notable improvements in four detection datasets. In the +segmentation tasks on seven core datasets of the BOP challenge, our method +outperforms the leading published RGB methods and remains competitive with the +best RGB-D method. We have also verified our method using real-world images +from a Fetch robot and a RealSense camera. Project Page: +https://irvlutd.github.io/NIDSNet/ + +
+
+ comment: Project Page: https://irvlutd.github.io/NIDSNet/ +
+
+
+
+
+ + ♻ ☆ MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D + Medical Image Analysis + + +
+ Efficient evaluation of three-dimensional (3D) medical images is crucial for +diagnostic and therapeutic practices in healthcare. Recent years have seen a +substantial uptake in applying deep learning and computer vision to analyse and +interpret medical images. Traditional approaches, such as convolutional neural +networks (CNNs) and vision transformers (ViTs), face significant computational +challenges, prompting the need for architectural advancements. Recent efforts +have led to the introduction of novel architectures like the ``Mamba'' model as +alternative solutions to traditional CNNs or ViTs. The Mamba model excels in +the linear processing of one-dimensional data with low computational demands. +However, Mamba's potential for 3D medical image analysis remains underexplored +and could face significant computational challenges as the dimension increases. +This manuscript presents MobileViM, a streamlined architecture for efficient +segmentation of 3D medical images. In the MobileViM network, we invent a new +dimension-independent mechanism and a dual-direction traversing approach to +incorporate with a vision-Mamba-based framework. MobileViM also features a +cross-scale bridging technique to improve efficiency and accuracy across +various medical imaging modalities. With these enhancements, MobileViM achieves +segmentation speeds exceeding 90 frames per second (FPS) on a single graphics +processing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster +than the state-of-the-art deep learning models for processing 3D images with +the same computational resources. In addition, experimental evaluations +demonstrate that MobileViM delivers superior performance, with Dice similarity +scores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024, +ATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses +existing models. + +
+
+
+
+
+ + ♻ ☆ Vision-based Geo-Localization of Future Mars Rotorcraft in Challenging + Illumination Conditions + + +
+ Planetary exploration using aerial assets has the potential for unprecedented +scientific discoveries on Mars. While NASA's Mars helicopter Ingenuity proved +flight in Martian atmosphere is possible, future Mars rotorcraft will require +advanced navigation capabilities for long-range flights. One such critical +capability is Map-based Localization (MbL) which registers an onboard image to +a reference map during flight in order to mitigate cumulative drift from visual +odometry. However, significant illumination differences between rotorcraft +observations and a reference map prove challenging for traditional MbL systems, +restricting the operational window of the vehicle. In this work, we investigate +a new MbL system and propose Geo-LoFTR, a geometry-aided deep learning model +for image registration that is more robust under large illumination differences +than prior models. The system is supported by a custom simulation framework +that uses real orbital maps to produce large amounts of realistic images of the +Martian terrain. Comprehensive evaluations show that our proposed system +outperforms prior MbL efforts in terms of localization accuracy under +significant lighting and scale variations. Furthermore, we demonstrate the +validity of our approach across a simulated Martian day. + +
+
+
+
+
+ + ♻ ☆ LS-HAR: Language Supervised Human Action Recognition with Salient + Fusion, Construction Sites as a Use-Case + + +
+ Detecting human actions is a crucial task for autonomous robots and vehicles, +often requiring the integration of various data modalities for improved +accuracy. In this study, we introduce a novel approach to Human Action +Recognition (HAR) using language supervision named LS-HAR based on skeleton and +visual cues. Our method leverages a language model to guide the feature +extraction process in the skeleton encoder. Specifically, we employ learnable +prompts for the language model conditioned on the skeleton modality to optimize +feature representation. Furthermore, we propose a fusion mechanism that +combines dual-modality features using a salient fusion module, incorporating +attention and transformer mechanisms to address the modalities' high +dimensionality. This fusion process prioritizes informative video frames and +body joints, enhancing the recognition accuracy of human actions. Additionally, +we introduce a new dataset tailored for real-world robotic applications in +construction sites, featuring visual, skeleton, and depth data modalities, +named VolvoConstAct. This dataset serves to facilitate the training and +evaluation of machine learning models to instruct autonomous construction +machines for performing necessary tasks in real-world construction sites. To +evaluate our approach, we conduct experiments on our dataset as well as three +widely used public datasets: NTU-RGB+D, NTU-RGB+D 120, and NW-UCLA. Results +reveal that our proposed method achieves promising performance across all +datasets, demonstrating its robustness and potential for various applications. +The code, dataset, and demonstration of real-machine experiments are available +at: https://mmahdavian.github.io/ls_har/ + +
+
+
+
+
+ + ♻ ☆ CSCPR: Cross-Source-Context Indoor RGB-D Place Recognition + + +
+ We extend our previous work, PoCo, and present a new algorithm, +Cross-Source-Context Place Recognition (CSCPR), for RGB-D indoor place +recognition that integrates global retrieval and reranking into an end-to-end +model and keeps the consistency of using Context-of-Clusters (CoCs) for feature +processing. Unlike prior approaches that primarily focus on the RGB domain for +place recognition reranking, CSCPR is designed to handle the RGB-D data. We +apply the CoCs to handle cross-sourced and cross-scaled RGB-D point clouds and +introduce two novel modules for reranking: the Self-Context Cluster (SCC) and +the Cross Source Context Cluster (CSCC), which enhance feature representation +and match query-database pairs based on local features, respectively. We also +release two new datasets, ScanNetIPR and ARKitIPR. Our experiments demonstrate +that CSCPR significantly outperforms state-of-the-art models on these datasets +by at least 29.27% in Recall@1 on the ScanNet-PR dataset and 43.24% in the new +datasets. Code and datasets will be released. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 150 + +
+
+
+ + ☆ The MASK Benchmark: Disentangling Honesty From Accuracy in AI Systems + + +
+ As large language models (LLMs) become more capable and agentic, the +requirement for trust in their outputs grows significantly, yet at the same +time concerns have been mounting that models may learn to lie in pursuit of +their goals. To address these concerns, a body of work has emerged around the +notion of "honesty" in LLMs, along with interventions aimed at mitigating +deceptive behaviors. However, evaluations of honesty are currently highly +limited, with no benchmark combining large scale and applicability to all +models. Moreover, many benchmarks claiming to measure honesty in fact simply +measure accuracy--the correctness of a model's beliefs--in disguise. In this +work, we introduce a large-scale human-collected dataset for measuring honesty +directly, allowing us to disentangle accuracy from honesty for the first time. +Across a diverse set of LLMs, we find that while larger models obtain higher +accuracy on our benchmark, they do not become more honest. Surprisingly, while +most frontier LLMs obtain high scores on truthfulness benchmarks, we find a +substantial propensity in frontier LLMs to lie when pressured to do so, +resulting in low honesty scores on our benchmark. We find that simple methods, +such as representation engineering interventions, can improve honesty. These +results underscore the growing need for robust evaluations and effective +interventions to ensure LLMs remain trustworthy. + +
+
+ comment: Website: https://www.mask-benchmark.ai +
+
+
+
+
+ + ☆ Process-based Self-Rewarding Language Models + + +
+ Large Language Models have demonstrated outstanding performance across +various downstream tasks and have been widely applied in multiple scenarios. +Human-annotated preference data is used for training to further improve LLMs' +performance, which is constrained by the upper limit of human performance. +Therefore, Self-Rewarding method has been proposed, where LLMs generate +training data by rewarding their own outputs. However, the existing +self-rewarding paradigm is not effective in mathematical reasoning scenarios +and may even lead to a decline in performance. In this work, we propose the +Process-based Self-Rewarding pipeline for language models, which introduces +long-thought reasoning, step-wise LLM-as-a-Judge, and step-wise preference +optimization within the self-rewarding paradigm. Our new paradigm successfully +enhances the performance of LLMs on multiple mathematical reasoning benchmarks +through iterative Process-based Self-Rewarding, demonstrating the immense +potential of self-rewarding to achieve LLM reasoning that may surpass human +capabilities. + +
+
+
+
+
+ + ☆ CHOP: Mobile Operating Assistant with Constrained High-frequency + Optimized Subtask Planning + + +
+ The advancement of visual language models (VLMs) has enhanced mobile device +operations, allowing simulated human-like actions to address user requirements. +Current VLM-based mobile operating assistants can be structured into three +levels: task, subtask, and action. The subtask level, linking high-level goals +with low-level executable actions, is crucial for task completion but faces two +challenges: ineffective subtasks that lower-level agent cannot execute and +inefficient subtasks that fail to contribute to the completion of the +higher-level task. These challenges stem from VLM's lack of experience in +decomposing subtasks within GUI scenarios in multi-agent architecture. To +address these, we propose a new mobile assistant architecture with constrained +high-frequency optimized planning (CHOP). Our approach overcomes the VLM's +deficiency in GUI scenarios planning by using human-planned subtasks as the +basis vector. We evaluate our architecture in both English and Chinese contexts +across 20 Apps, demonstrating significant improvements in both effectiveness +and efficiency. Our dataset and code are available at +https://github.com/Yuqi-Zhou/CHOP + +
+
+
+
+
+ + ☆ Rethinking Deep Clustering Paradigms: Self-Supervision Is All You Need + + +
+ The recent advances in deep clustering have been made possible by significant +progress in self-supervised and pseudo-supervised learning. However, the +trade-off between self-supervision and pseudo-supervision can give rise to +three primary issues. The joint training causes Feature Randomness and Feature +Drift, whereas the independent training causes Feature Randomness and Feature +Twist. In essence, using pseudo-labels generates random and unreliable +features. The combination of pseudo-supervision and self-supervision drifts the +reliable clustering-oriented features. Moreover, moving from self-supervision +to pseudo-supervision can twist the curved latent manifolds. This paper +addresses the limitations of existing deep clustering paradigms concerning +Feature Randomness, Feature Drift, and Feature Twist. We propose a new paradigm +with a new strategy that replaces pseudo-supervision with a second round of +self-supervision training. The new strategy makes the transition between +instance-level self-supervision and neighborhood-level self-supervision +smoother and less abrupt. Moreover, it prevents the drifting effect that is +caused by the strong competition between instance-level self-supervision and +clustering-level pseudo-supervision. Moreover, the absence of the +pseudo-supervision prevents the risk of generating random features. With this +novel approach, our paper introduces a Rethinking of the Deep Clustering +Paradigms, denoted by R-DC. Our model is specifically designed to address three +primary challenges encountered in Deep Clustering: Feature Randomness, Feature +Drift, and Feature Twist. Experimental results conducted on six datasets have +shown that the two-level self-supervision training yields substantial +improvements. + +
+
+
+
+
+ + ☆ Deep Causal Behavioral Policy Learning: Applications to Healthcare + + +
+ We present a deep learning-based approach to studying dynamic clinical +behavioral regimes in diverse non-randomized healthcare settings. Our proposed +methodology - deep causal behavioral policy learning (DC-BPL) - uses deep +learning algorithms to learn the distribution of high-dimensional clinical +action paths, and identifies the causal link between these action paths and +patient outcomes. Specifically, our approach: (1) identifies the causal effects +of provider assignment on clinical outcomes; (2) learns the distribution of +clinical actions a given provider would take given evolving patient +information; (3) and combines these steps to identify the optimal provider for +a given patient type and emulate that provider's care decisions. Underlying +this strategy, we train a large clinical behavioral model (LCBM) on electronic +health records data using a transformer architecture, and demonstrate its +ability to estimate clinical behavioral policies. We propose a novel +interpretation of a behavioral policy learned using the LCBM: that it is an +efficient encoding of complex, often implicit, knowledge used to treat a +patient. This allows us to learn a space of policies that are critical to a +wide range of healthcare applications, in which the vast majority of clinical +knowledge is acquired tacitly through years of practice and only a tiny +fraction of information relevant to patient care is written down (e.g. in +textbooks, studies or standardized guidelines). + +
+
+
+
+
+ + ☆ Machine Learning in Biomechanics: Key Applications and Limitations in + Walking, Running, and Sports Movements + + +
+ This chapter provides an overview of recent and promising Machine Learning +applications, i.e. pose estimation, feature estimation, event detection, data +exploration & clustering, and automated classification, in gait (walking and +running) and sports biomechanics. It explores the potential of Machine Learning +methods to address challenges in biomechanical workflows, highlights central +limitations, i.e. data and annotation availability and explainability, that +need to be addressed, and emphasises the importance of interdisciplinary +approaches for fully harnessing the potential of Machine Learning in gait and +sports biomechanics. + +
+
+
+
+
+ + ☆ Rethinking Video Tokenization: A Conditioned Diffusion-based Approach + + +
+ Video tokenizers, which transform videos into compact latent representations, +are key to video generation. Existing video tokenizers are based on the VAE +architecture and follow a paradigm where an encoder compresses videos into +compact latents, and a deterministic decoder reconstructs the original videos +from these latents. In this paper, we propose a novel +\underline{\textbf{C}}onditioned \underline{\textbf{D}}iffusion-based video +\underline{\textbf{T}}okenizer entitled \textbf{\ourmethod}, which departs from +previous methods by replacing the deterministic decoder with a 3D causal +diffusion model. The reverse diffusion generative process of the decoder is +conditioned on the latent representations derived via the encoder. With a +feature caching and sampling acceleration, the framework efficiently +reconstructs high-fidelity videos of arbitrary lengths. Results show that +{\ourmethod} achieves state-of-the-art performance in video reconstruction +tasks using just a single-step sampling. Even a smaller version of {\ourmethod} +still achieves reconstruction results on par with the top two baselines. +Furthermore, the latent video generation model trained using {\ourmethod} also +shows superior performance. + +
+
+
+
+
+ + ☆ Curating Demonstrations using Online Experience + + +
+ Many robot demonstration datasets contain heterogeneous demonstrations of +varying quality. This heterogeneity may benefit policy pre-training, but can +hinder robot performance when used with a final imitation learning objective. +In particular, some strategies in the data may be less reliable than others or +may be underrepresented in the data, leading to poor performance when such +strategies are sampled at test time. Moreover, such unreliable or +underrepresented strategies can be difficult even for people to discern, and +sifting through demonstration datasets is time-consuming and costly. On the +other hand, policy performance when trained on such demonstrations can reflect +the reliability of different strategies. We thus propose for robots to +self-curate based on online robot experience (Demo-SCORE). More specifically, +we train and cross-validate a classifier to discern successful policy roll-outs +from unsuccessful ones and use the classifier to filter heterogeneous +demonstration datasets. Our experiments in simulation and the real world show +that Demo-SCORE can effectively identify suboptimal demonstrations without +manual curation. Notably, Demo-SCORE achieves over 15-35% higher absolute +success rate in the resulting policy compared to the base policy trained with +all original demonstrations. + +
+
+
+
+
+ + ☆ ILLC: Iterative Layer-by-Layer Compression for Enhancing Structural + Faithfulness in SpArX + + +
+ In the field of Explainable Artificial Intelligence (XAI), argumentative XAI +approaches have been proposed to represent the internal reasoning process of +deep neural networks in a more transparent way by interpreting hidden nodes as +arguments. However, as the number of layers increases, existing compression +methods simplify all layers at once, which leads to high accumulative +information loss. To compensate for this, we propose an iterative +layer-by-layer compression technique in which each layer is compressed +separately and the reduction error in the next layer is immediately compensated +for, thereby improving the overall input-output and structural fidelity of the +model. Experiments on the Breast Cancer Diagnosis dataset show that, compared +to traditional compression, the method reduces input-output and structural +unfaithfulness, and maintains a more consistent attack-support relationship in +the Argumentative Explanation scheme. This is significant because it provides a +new way to make complex MLP models more compact while still conveying their +internal inference logic without distortion. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Attentive Reasoning Queries: A Systematic Method for Optimizing + Instruction-Following in Large Language Models + + +
+ We present Attentive Reasoning Queries (ARQs), a novel structured reasoning +approach that significantly improves instruction-following in Large Language +Models through domain-specialized reasoning blueprints. While LLMs demonstrate +remarkable capabilities across diverse tasks, they often fail to maintain +adherence to complex, use-case-specific instructions during multi-turn +conversations, presenting challenges for business-critical applications. ARQs +address this limitation by guiding LLMs through systematic reasoning steps with +targeted queries that reinstate critical instructions and facilitate +intermediate reasoning throughout the completion process. In extensive testing +within Parlant, our framework for reliable customer-facing agents in which ARQs +were born out of necessity, they achieved a 90.2% success rate across 87 test +scenarios, outperforming both Chain-of-Thought reasoning (86.1%) and direct +response generation (81.5%). ARQs showed particular strength in addressing +persistent failure modes like guideline re-application and hallucination +prevention. Our analysis also revealed that ARQs can potentially be more +computationally efficient than free-form reasoning when carefully designed. +These findings demonstrate that structured reasoning approaches provide +effective mechanisms for controlling how LLMs process information and make +decisions in complex scenarios. + +
+
+ comment: Supplementary materials, including code, is available on our GitHub: + https://github.com/emcie-co/parlant/tree/arqs-a-systematic-method-for-optimizing-instruction-following-in-llms +
+
+
+
+
+ + ☆ A Generative Approach to High Fidelity 3D Reconstruction from Text Data + + +
+ The convergence of generative artificial intelligence and advanced computer +vision technologies introduces a groundbreaking approach to transforming +textual descriptions into three-dimensional representations. This research +proposes a fully automated pipeline that seamlessly integrates text-to-image +generation, various image processing techniques, and deep learning methods for +reflection removal and 3D reconstruction. By leveraging state-of-the-art +generative models like Stable Diffusion, the methodology translates natural +language inputs into detailed 3D models through a multi-stage workflow. + The reconstruction process begins with the generation of high-quality images +from textual prompts, followed by enhancement by a reinforcement learning agent +and reflection removal using the Stable Delight model. Advanced image upscaling +and background removal techniques are then applied to further enhance visual +fidelity. These refined two-dimensional representations are subsequently +transformed into volumetric 3D models using sophisticated machine learning +algorithms, capturing intricate spatial relationships and geometric +characteristics. This process achieves a highly structured and detailed output, +ensuring that the final 3D models reflect both semantic accuracy and geometric +precision. + This approach addresses key challenges in generative reconstruction, such as +maintaining semantic coherence, managing geometric complexity, and preserving +detailed visual information. Comprehensive experimental evaluations will assess +reconstruction quality, semantic accuracy, and geometric fidelity across +diverse domains and varying levels of complexity. By demonstrating the +potential of AI-driven 3D reconstruction techniques, this research offers +significant implications for fields such as augmented reality (AR), virtual +reality (VR), and digital content creation. + +
+
+
+
+
+ + ☆ Improving 6D Object Pose Estimation of metallic Household and Industry + Objects + + +
+ 6D object pose estimation suffers from reduced accuracy when applied to +metallic objects. We set out to improve the state-of-the-art by addressing +challenges such as reflections and specular highlights in industrial +applications. Our novel BOP-compatible dataset, featuring a diverse set of +metallic objects (cans, household, and industrial items) under various lighting +and background conditions, provides additional geometric and visual cues. We +demonstrate that these cues can be effectively leveraged to enhance overall +performance. To illustrate the usefulness of the additional features, we +improve upon the GDRNPP algorithm by introducing an additional keypoint +prediction and material estimator head in order to improve spatial scene +understanding. Evaluations on the new dataset show improved accuracy for +metallic objects, supporting the hypothesis that additional geometric and +visual cues can improve learning. + +
+
+
+
+
+ + ☆ Improving Neutral Point of View Text Generation through + Parameter-Efficient Reinforcement Learning and a Small-Scale High-Quality + Dataset + + +
+ This paper describes the construction of a dataset and the evaluation of +training methods to improve generative large language models' (LLMs) ability to +answer queries on sensitive topics with a Neutral Point of View (NPOV), i.e., +to provide significantly more informative, diverse and impartial answers. The +dataset, the SHQ-NPOV dataset, comprises 300 high-quality, human-written +quadruplets: a query on a sensitive topic, an answer, an NPOV rating, and a set +of links to source texts elaborating the various points of view. The first key +contribution of this paper is a new methodology to create such datasets through +iterative rounds of human peer-critique and annotator training, which we +release alongside the dataset. The second key contribution is the +identification of a highly effective training regime for parameter-efficient +reinforcement learning (PE-RL) to improve NPOV generation. We compare and +extensively evaluate PE-RL and multiple baselines-including LoRA finetuning (a +strong baseline), SFT and RLHF. + PE-RL not only improves on overall NPOV quality compared to the strongest +baseline ($97.06\%\rightarrow 99.08\%$), but also scores much higher on +features linguists identify as key to separating good answers from the best +answers ($60.25\%\rightarrow 85.21\%$ for presence of supportive details, +$68.74\%\rightarrow 91.43\%$ for absence of oversimplification). A qualitative +analysis corroborates this. Finally, our evaluation finds no statistical +differences between results on topics that appear in the training dataset and +those on separated evaluation topics, which provides strong evidence that our +approach to training PE-RL exhibits very effective out of topic generalization. + +
+
+
+
+
+ + ☆ Decoupled Recommender Systems: Exploring Alternative Recommender + Ecosystem Designs + + +
+ Recommender ecosystems are an emerging subject of research. Such research +examines how the characteristics of algorithms, recommendation consumers, and +item providers influence system dynamics and long-term outcomes. One +architectural possibility that has not yet been widely explored in this line of +research is the consequences of a configuration in which recommendation +algorithms are decoupled from the platforms they serve. This is sometimes +called "the friendly neighborhood algorithm store" or "middleware" model. We +are particularly interested in how such architectures might offer a range of +different distributions of utility across consumers, providers, and +recommendation platforms. In this paper, we create a model of a recommendation +ecosystem that incorporates algorithm choice and examine the outcomes of such a +design. + +
+
+
+
+
+ + ☆ Towards Understanding Text Hallucination of Diffusion Models via Local + Generation Bias + + +
+ Score-based diffusion models have achieved incredible performance in +generating realistic images, audio, and video data. While these models produce +high-quality samples with impressive details, they often introduce unrealistic +artifacts, such as distorted fingers or hallucinated texts with no meaning. +This paper focuses on textual hallucinations, where diffusion models correctly +generate individual symbols but assemble them in a nonsensical manner. Through +experimental probing, we consistently observe that this phenomenon is +attributable to the network's local generation bias. Denoising networks tend +to produce outputs that rely heavily on highly correlated local regions, +particularly when different dimensions of the data distribution are nearly +pairwise independent. This behavior leads to a generation process that +decomposes the global distribution into separate, independent distributions for +each symbol, ultimately failing to capture the global structure, including +underlying grammar. Intriguingly, this bias persists across various denoising +network architectures including MLP and transformers which have the structure +to model global dependency. These findings also provide insights into +understanding other types of hallucinations, extending beyond text, as a result +of implicit biases in the denoising models. Additionally, we theoretically +analyze the training dynamics for a specific case involving a two-layer MLP +learning parity points on a hypercube, offering an explanation of its +underlying mechanism. + +
+
+
+
+
+ + ☆ Small but Mighty: Enhancing Time Series Forecasting with Lightweight + LLMs + + +
+ While LLMs have demonstrated remarkable potential in time series forecasting, +their practical deployment remains constrained by excessive computational +demands and memory footprints. Existing LLM-based approaches typically suffer +from three critical limitations: Inefficient parameter utilization in handling +numerical time series patterns; Modality misalignment between continuous +temporal signals and discrete text embeddings; and Inflexibility for real-time +expert knowledge integration. We present SMETimes, the first systematic +investigation of sub-3B parameter SLMs for efficient and accurate time series +forecasting. Our approach centers on three key innovations: A +statistically-enhanced prompting mechanism that bridges numerical time series +with textual semantics through descriptive statistical features; An adaptive +fusion embedding architecture that aligns temporal patterns with language model +token spaces through learnable parameters; And a dynamic mixture-of-experts +framework enabled by SLMs' computational efficiency, adaptively combining base +predictions with domain-specific models. Extensive evaluations across seven +benchmark datasets demonstrate that our 3B-parameter SLM achieves +state-of-the-art performance on five primary datasets while maintaining 3.8x +faster training and 5.2x lower memory consumption compared to 7B-parameter LLM +baselines. Notably, the proposed model exhibits better learning capabilities, +achieving 12.3% lower MSE than conventional LLM. Ablation studies validate that +our statistical prompting and cross-modal fusion modules respectively +contribute 15.7% and 18.2% error reduction in long-horizon forecasting tasks. +By redefining the efficiency-accuracy trade-off landscape, this work +establishes SLMs as viable alternatives to resource-intensive LLMs for +practical time series forecasting. Code and models are available at +https://github.com/xiyan1234567/SMETimes. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ English K_Quantization of LLMs Does Not Disproportionately Diminish + Multilingual Performance + + +
+ For consumer usage of locally deployed LLMs, the GGUF format and +k_quantization are invaluable tools for maintaining the performance of the +original model while reducing it to sizes deployable with consumer-grade +hardware. The number of bits dedicated to each weight from the original model +is reduced based on how important they are thought to be during model +inference. This importance is arrived at through the application of an +'importance matrix'-a relatively small text document meant to be representative +of the LLM's standard use-cases. In the vast majority of quants available +online, this document is primarily written in English. It was therefore an open +question whether performance on English language tasks was preserved through +the sacrifice of multilingual performance and whether it can be preserved with +alternate importance matrices. This article investigates these hypotheses by +quantizing Llama3.3 70B on importance matrices written in three languages +(English, Norwegian, and Malayalam) and evaluating them on the MixEval dataset +in both English and Norwegian. All experiments related to k_quantization +yielded non-significant results (In all cases p > 0.237) indicating that +current quantization practices do not disproportionately harm multilingual +performance. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ A Conceptual Model for Attributions in Event-Centric Knowledge Graphs + + +
+ The use of narratives as a means of fusing information from knowledge graphs +(KGs) into a coherent line of argumentation has been the subject of recent +investigation. Narratives are especially useful in event-centric knowledge +graphs in that they provide a means to connect different real-world events and +categorize them by well-known narrations. However, specifically for +controversial events, a problem in information fusion arises, namely, multiple +viewpoints regarding the validity of certain event aspects, e.g., regarding the +role a participant takes in an event, may exist. Expressing those viewpoints in +KGs is challenging because disputed information provided by different +viewpoints may introduce inconsistencies. Hence, most KGs only feature a single +view on the contained information, hampering the effectiveness of narrative +information access. This paper is an extension of our original work and +introduces attributions, i.e., parameterized predicates that allow for the +representation of facts that are only valid in a specific viewpoint. For this, +we develop a conceptual model that allows for the representation of +viewpoint-dependent information. As an extension, we enhance the model by a +conception of viewpoint-compatibility. Based on this, we deepen our original +deliberations on the model's effects on information fusion and provide +additional grounding in the literature. + +
+
+ comment: Submitted to Data & Knowledge Engineering, 22 pages, 9 figures +
+
+
+
+
+ + ☆ Towards Visual Discrimination and Reasoning of Real-World Physical + Dynamics: Physics-Grounded Anomaly Detection CVPR 2025 + + +
+ Humans detect real-world object anomalies by perceiving, interacting, and +reasoning based on object-conditioned physical knowledge. The long-term goal of +Industrial Anomaly Detection (IAD) is to enable machines to autonomously +replicate this skill. However, current IAD algorithms are largely developed and +tested on static, semantically simple datasets, which diverge from real-world +scenarios where physical understanding and reasoning are essential. To bridge +this gap, we introduce the Physics Anomaly Detection (Phys-AD) dataset, the +first large-scale, real-world, physics-grounded video dataset for industrial +anomaly detection. Collected using a real robot arm and motor, Phys-AD provides +a diverse set of dynamic, semantically rich scenarios. The dataset includes +more than 6400 videos across 22 real-world object categories, interacting with +robot arms and motors, and exhibits 47 types of anomalies. Anomaly detection in +Phys-AD requires visual reasoning, combining both physical knowledge and video +content to determine object abnormality. We benchmark state-of-the-art anomaly +detection methods under three settings: unsupervised AD, weakly-supervised AD, +and video-understanding AD, highlighting their limitations in handling +physics-grounded anomalies. Additionally, we introduce the Physics Anomaly +Explanation (PAEval) metric, designed to assess the ability of visual-language +foundation models to not only detect anomalies but also provide accurate +explanations for their underlying physical causes. Our dataset and benchmark +will be publicly available. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ☆ AI-Enabled Conversational Journaling for Advancing Parkinson's Disease + Symptom Tracking + + +
+ Journaling plays a crucial role in managing chronic conditions by allowing +patients to document symptoms and medication intake, providing essential data +for long-term care. While valuable, traditional journaling methods often rely +on static, self-directed entries, lacking interactive feedback and real-time +guidance. This gap can result in incomplete or imprecise information, limiting +its usefulness for effective treatment. To address this gap, we introduce +PATRIKA, an AI-enabled prototype designed specifically for people with +Parkinson's disease (PwPD). The system incorporates cooperative conversation +principles, clinical interview simulations, and personalization to create a +more effective and user-friendly journaling experience. Through two user +studies with PwPD and iterative refinement of PATRIKA, we demonstrate +conversational journaling's significant potential in patient engagement and +collecting clinically valuable information. Our results showed that, by generating +probing questions, PATRIKA turned journaling into a bi-directional interaction. +Additionally, we offer insights for designing journaling systems for healthcare +and future directions for promoting sustained journaling. + +
+
+ comment: To appear in the ACM CHI conference on Human Factors in Computing + Systems (CHI), 2025 +
+
+
+
+
+ + ☆ AdaSin: Enhancing Hard Sample Metrics with Dual Adaptive Penalty for + Face Recognition + + +
+ In recent years, the emergence of deep convolutional neural networks has +positioned face recognition as a prominent research focus in computer vision. +Traditional loss functions, such as margin-based, hard-sample mining-based, and +hybrid approaches, have achieved notable performance improvements, with some +leveraging curriculum learning to optimize training. However, these methods +often fall short in effectively quantifying the difficulty of hard samples. To +address this, we propose Adaptive Sine (AdaSin) loss function, which introduces +the sine of the angle between a sample's embedding feature and its ground-truth +class center as a novel difficulty metric. This metric enables precise and +effective penalization of hard samples. By incorporating curriculum learning, +the model dynamically adjusts classification boundaries across different +training stages. Unlike previous adaptive-margin loss functions, AdaSin +introduces a dual adaptive penalty, applied to both the positive and negative +cosine similarities of hard samples. This design imposes stronger constraints, +enhancing intra-class compactness and inter-class separability. The combination +of the dual adaptive penalty and curriculum learning is guided by a +well-designed difficulty metric. It enables the model to focus more effectively +on hard samples in later training stages, and leads to the extraction of highly +discriminative face features. Extensive experiments across eight benchmarks +demonstrate that AdaSin achieves superior accuracy compared to other +state-of-the-art methods. + +
+
+
+
+
+ + ☆ NeuGrasp: Generalizable Neural Surface Reconstruction with Background + Priors for Material-Agnostic Object Grasp Detection ICRA + + +
+ Robotic grasping in scenes with transparent and specular objects presents +great challenges for methods relying on accurate depth information. In this +paper, we introduce NeuGrasp, a neural surface reconstruction method that +leverages background priors for material-agnostic grasp detection. NeuGrasp +integrates transformers and global prior volumes to aggregate multi-view +features with spatial encoding, enabling robust surface reconstruction in +narrow and sparse viewing conditions. By focusing on foreground objects through +residual feature enhancement and refining spatial perception with an +occupancy-prior volume, NeuGrasp excels in handling objects with transparent +and specular surfaces. Extensive experiments in both simulated and real-world +scenarios show that NeuGrasp outperforms state-of-the-art methods in grasping +while maintaining comparable reconstruction quality. More details are available +at https://neugrasp.github.io/. + +
+
+ comment: 7 pages, 5 figures. IEEE International Conference on Robotics and + Automation (ICRA) 2025 +
+
+
+
+
+ + ☆ Rethinking Synthetic Data definitions: A privacy driven approach + + +
+ Synthetic data is gaining traction as a cost-effective solution for the +increasing data demands of AI development and can be generated either from +existing knowledge or derived data captured from real-world events. The source +of the synthetic data generation and the technique used significantly impact +its residual privacy risk and therefore its opportunity for sharing. +Traditional classifications of synthetic data types no longer fit the newer +generation techniques and there is a need to better align the classification +with practical needs. We suggest a new way of grouping synthetic data types +that better supports privacy evaluations to aid regulatory policymaking. Our +novel classification provides flexibility to new advancements like deep +generative methods and offers a more practical framework for future +applications. + +
+
+
+
+
+ + ☆ Parallelized Planning-Acting for Efficient LLM-based Multi-Agent Systems + + +
+ Recent advancements in Large Language Model(LLM)-based Multi-Agent +Systems(MAS) have demonstrated remarkable potential for tackling complex +decision-making tasks. However, existing frameworks inevitably rely on +serialized execution paradigms, where agents must complete sequential LLM +planning before taking action. This fundamental constraint severely limits +real-time responsiveness and adaptation, which is crucial in dynamic +environments with ever-changing scenarios. In this paper, we propose a novel +parallelized planning-acting framework for LLM-based MAS, featuring a +dual-thread architecture with interruptible execution to enable concurrent +planning and acting. Specifically, our framework comprises two core threads:(1) +a planning thread driven by a centralized memory system, maintaining +synchronization of environmental states and agent communication to support +dynamic decision-making; and (2) an acting thread equipped with a comprehensive +skill library, enabling automated task execution through recursive +decomposition. Extensive experiments on challenging Minecraft demonstrate the +effectiveness of the proposed framework. + +
+
+
+
+
+ + ☆ Collaborative Expert LLMs Guided Multi-Objective Molecular Optimization + + +
+ Molecular optimization is a crucial yet complex and time-intensive process +that often acts as a bottleneck for drug development. Traditional methods rely +heavily on trial and error, making multi-objective optimization both +time-consuming and resource-intensive. Current AI-based methods have shown +limited success in handling multi-objective optimization tasks, hampering their +practical utilization. To address this challenge, we present MultiMol, a +collaborative large language model (LLM) system designed to guide +multi-objective molecular optimization. MultiMol comprises two agents, +including a data-driven worker agent and a literature-guided research agent. +The data-driven worker agent is a large language model being fine-tuned to +learn how to generate optimized molecules considering multiple objectives, +while the literature-guided research agent is responsible for searching +task-related literature to find useful prior knowledge that facilitates +identifying the most promising optimized candidates. In evaluations across six +multi-objective optimization tasks, MultiMol significantly outperforms existing +methods, achieving a 82.30% success rate, in sharp contrast to the 27.50% +success rate of current strongest methods. To further validate its practical +impact, we tested MultiMol on two real-world challenges. First, we enhanced the +selectivity of Xanthine Amine Congener (XAC), a promiscuous ligand that binds +both A1R and A2AR, successfully biasing it towards A1R. Second, we improved the +bioavailability of Saquinavir, an HIV-1 protease inhibitor with known +bioavailability limitations. Overall, these results indicate that MultiMol +represents a highly promising approach for multi-objective molecular +optimization, holding great potential to accelerate the drug development +process and contribute to the advancement of pharmaceutical research. + +
+
+
+
+
+ + ☆ CURVALID: Geometrically-guided Adversarial Prompt Detection + + +
+ Adversarial prompts capable of jailbreaking large language models (LLMs) and +inducing undesirable behaviours pose a significant obstacle to their safe +deployment. Current mitigation strategies rely on activating built-in defence +mechanisms or fine-tuning the LLMs, but the fundamental distinctions between +adversarial and benign prompts are yet to be understood. In this work, we +introduce CurvaLID, a novel defense framework that efficiently detects +adversarial prompts by leveraging their geometric properties. It is agnostic to +the type of LLM, offering a unified detection framework across diverse +adversarial prompts and LLM architectures. CurvaLID builds on the geometric +analysis of text prompts to uncover their underlying differences. We +theoretically extend the concept of curvature via the Whewell equation into an +$n$-dimensional word embedding space, enabling us to quantify local geometric +properties, including semantic shifts and curvature in the underlying +manifolds. Additionally, we employ Local Intrinsic Dimensionality (LID) to +capture geometric features of text prompts within adversarial subspaces. Our +findings reveal that adversarial prompts differ fundamentally from benign +prompts in terms of their geometric characteristics. Our results demonstrate +that CurvaLID delivers superior detection and rejection of adversarial queries, +paving the way for safer LLM deployment. The source code can be found at +https://github.com/Cancanxxx/CurvaLID + +
+
+ comment: 29 Pages, 5 figures +
+
+
+
+
+ + ☆ SafeVLA: Towards Safety Alignment of Vision-Language-Action Model via + Safe Reinforcement Learning + + +
+ Vision-language-action models (VLAs) have shown great potential as generalist +robot policies. However, these models pose urgent safety challenges during +deployment, including the risk of physical harm to the environment, the robot +itself, and humans. How can safety be explicitly incorporated into VLAs? In +this work, we propose SafeVLA, a novel algorithm designed to integrate safety +into VLAs, ensuring the protection of the environment, robot hardware and +humans in real-world settings. SafeVLA effectively balances safety and task +performance by employing large-scale constrained learning within simulated +environments. We demonstrate that SafeVLA outperforms the current +state-of-the-art method in both safety and task performance, achieving average +improvements of 83.58% and 3.85%, respectively, in simulation. By prioritizing +safety, our approach eliminates high-risk behaviors and reduces the upper bound +of unsafe behaviors to 1/35 of that in the current state-of-the-art, thereby +significantly mitigating long-tail risks. Furthermore, the learned safety +constraints generalize to diverse, unseen scenarios, including multiple +out-of-distribution perturbations and tasks. Our data, models and newly +proposed benchmark environment are available at +https://sites.google.com/view/pku-safevla. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Open-Source Large Language Models as Multilingual Crowdworkers: + Synthesizing Open-Domain Dialogues in Several Languages With No Examples in + Targets and No Machine Translation + + +
+ The prevailing paradigm in the domain of Open-Domain Dialogue agents +predominantly focuses on the English language, encompassing both models and +datasets. Furthermore, the financial and temporal investments required for +crowdsourcing such datasets for finetuning are substantial, particularly when +multiple languages are involved. Fortunately, advancements in Large Language +Models (LLMs) have unveiled a plethora of possibilities across diverse tasks. +Specifically, instruction-tuning has enabled LLMs to execute tasks based on +natural language instructions, occasionally surpassing the performance of human +crowdworkers. Additionally, these models possess the capability to function in +various languages within a single thread. Consequently, to generate new samples +in different languages, we propose leveraging these capabilities to replicate +the data collection process. We introduce a pipeline for generating Open-Domain +Dialogue data in multiple Target Languages using LLMs, with demonstrations +provided in a unique Source Language. By eschewing explicit Machine Translation +in this approach, we enhance the adherence to language-specific nuances. We +apply this methodology to the PersonaChat dataset. To enhance the openness of +generated dialogues and mimic real life scenarii, we added the notion of speech +events corresponding to the type of conversation the speakers are involved in +and also that of common ground which represents the premises of a conversation. + +
+
+
+
+
+ + ☆ Unified Mind Model: Reimagining Autonomous Agents in the LLM Era + + +
+ Large language models (LLMs) have recently demonstrated remarkable +capabilities across domains, tasks, and languages (e.g., ChatGPT and GPT-4), +reviving the research of general autonomous agents with human-like cognitive +abilities. Such human-level agents require semantic comprehension and +instruction-following capabilities, which exactly fall into the strengths of +LLMs. Although there have been several initial attempts to build human-level +agents based on LLMs, the theoretical foundation remains a challenging open +problem. In this paper, we propose a novel theoretical cognitive architecture, +the Unified Mind Model (UMM), which offers guidance to facilitate the rapid +creation of autonomous agents with human-level cognitive abilities. +Specifically, our UMM starts with the global workspace theory and further +leverages LLMs to enable the agent with various cognitive abilities, such as +multi-modal perception, planning, reasoning, tool use, learning, memory, +reflection and motivation. Building upon UMM, we then develop an agent-building +engine, MindOS, which allows users to quickly create domain-/task-specific +autonomous agents without any programming effort. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Taxation Perspectives from Large Language Models: A Case Study on + Additional Tax Penalties + + +
+ How capable are large language models (LLMs) in the domain of taxation? +Although numerous studies have explored the legal domain in general, research +dedicated to taxation remains scarce. Moreover, the datasets used in these +studies are either simplified, failing to reflect the real-world complexities, +or unavailable as open source. To address this gap, we introduce PLAT, a new +benchmark designed to assess the ability of LLMs to predict the legitimacy of +additional tax penalties. PLAT is constructed to evaluate LLMs' understanding +of tax law, particularly in cases where resolving the issue requires more than +just applying related statutes. Our experiments with six LLMs reveal that their +baseline capabilities are limited, especially when dealing with conflicting +issues that demand a comprehensive understanding. However, we found that by +enabling retrieval, self-reasoning, and discussion among multiple agents with +specific role assignments, this limitation can be mitigated. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Conceptualizing Uncertainty + + +
+ Uncertainty in machine learning refers to the degree of confidence or lack +thereof in a model's predictions. While uncertainty quantification methods +exist, explanations of uncertainty, especially in high-dimensional settings, +remain an open challenge. Existing work focuses on feature attribution +approaches which are restricted to local explanations. Understanding +uncertainty, its origins, and characteristics on a global scale is crucial for +enhancing interpretability and trust in a model's predictions. In this work, we +propose to explain the uncertainty in high-dimensional data classification +settings by means of concept activation vectors which give rise to local and +global explanations of uncertainty. We demonstrate the utility of the generated +explanations by leveraging them to refine and improve our model. + +
+
+
+
+
+ + ☆ RASD: Retrieval-Augmented Speculative Decoding + + +
+ Speculative decoding accelerates inference in large language models (LLMs) by +generating draft tokens for target model verification. Current approaches for +obtaining draft tokens rely on lightweight draft models or additional model +structures to generate draft tokens and retrieve context from databases. Due to +the draft model's small size and limited training data, model-based speculative +decoding frequently becomes less effective in out-of-domain scenarios. +Additionally, the time cost of the drafting phase results in a low upper limit +on acceptance length during the verification step, limiting overall efficiency. +This paper proposes RASD (Retrieval-Augmented Speculative Decoding), which +adopts retrieval methods to enhance model-based speculative decoding. We +introduce tree pruning and tree fusion to achieve this. Specifically, we +develop a pruning method based on the draft model's probability distribution to +construct the optimal retrieval tree. Second, we employ the longest prefix +matching algorithm to merge the tree generated by the draft model with the +retrieval tree, resulting in a unified tree for verification. Experimental +results demonstrate that RASD achieves state-of-the-art inference acceleration +across tasks such as DocQA, Summary, Code, and In-Domain QA. Moreover, RASD +exhibits strong scalability, seamlessly integrating with various speculative +decoding approaches, including both generation-based and retrieval-based +methods. + +
+
+
+
+
+ + ☆ Privacy is All You Need: Revolutionizing Wearable Health Data with + Advanced PETs + + +
+ In a world where data is the new currency, wearable health devices offer +unprecedented insights into daily life, continuously monitoring vital signs and +metrics. However, this convenience raises privacy concerns, as these devices +collect sensitive data that can be misused or breached. Traditional measures +often fail due to real-time data processing needs and limited device power. +Users also lack awareness and control over data sharing and usage. We propose a +Privacy-Enhancing Technology (PET) framework for wearable devices, integrating +federated learning, lightweight cryptographic methods, and selectively deployed +blockchain technology. The blockchain acts as a secure ledger triggered only +upon data transfer requests, granting users real-time notifications and +control. By dismantling data monopolies, this approach returns data sovereignty +to individuals. Through real-world applications like secure medical data +sharing, privacy-preserving fitness tracking, and continuous health monitoring, +our framework reduces privacy risks by up to 70 percent while preserving data +utility and performance. This innovation sets a new benchmark for wearable +privacy and can scale to broader IoT ecosystems, including smart homes and +industry. As data continues to shape our digital landscape, our research +underscores the critical need to maintain privacy and user control at the +forefront of technological progress. + +
+
+
+
+
+ + ☆ Simplicial SMOTE: Oversampling Solution to the Imbalanced Learning + Problem KDD 2025 + + +
+ SMOTE (Synthetic Minority Oversampling Technique) is the established +geometric approach to random oversampling to balance classes in the imbalanced +learning problem, followed by many extensions. Its idea is to introduce +synthetic data points of the minor class, with each new point being the convex +combination of an existing data point and one of its k-nearest neighbors. In +this paper, by viewing SMOTE as sampling from the edges of a geometric +neighborhood graph and borrowing tools from the topological data analysis, we +propose a novel technique, Simplicial SMOTE, that samples from the simplices of +a geometric neighborhood simplicial complex. A new synthetic point is defined +by the barycentric coordinates w.r.t. a simplex spanned by an arbitrary number +of data points being sufficiently close rather than a pair. Such a replacement +of the geometric data model results in better coverage of the underlying data +distribution compared to existing geometric sampling methods and allows the +generation of synthetic points of the minority class closer to the majority +class on the decision boundary. We experimentally demonstrate that our +Simplicial SMOTE outperforms several popular geometric sampling methods, +including the original SMOTE. Moreover, we show that simplicial sampling can be +easily integrated into existing SMOTE extensions. We generalize and evaluate +simplicial extensions of the classic Borderline SMOTE, Safe-level SMOTE, and +ADASYN algorithms, all of which outperform their graph-based counterparts. + +
+
+ comment: Accepted at KDD 2025 (research track) +
+
+
+
+
+ + ☆ When Claims Evolve: Evaluating and Enhancing the Robustness of Embedding + Models Against Misinformation Edits + + +
+ Online misinformation remains a critical challenge, and fact-checkers +increasingly rely on embedding-based methods to retrieve relevant fact-checks. +Yet, when debunked claims reappear in edited forms, the performance of these +methods is unclear. In this work, we introduce a taxonomy of six common +real-world misinformation edits and propose a perturbation framework that +generates valid, natural claim variations. Our multi-stage retrieval evaluation +reveals that standard embedding models struggle with user-introduced edits, +while LLM-distilled embeddings offer improved robustness at a higher +computational cost. Although a strong reranker helps mitigate some issues, it +cannot fully compensate for first-stage retrieval gaps. Addressing these +retrieval gaps, our train- and inference-time mitigation approaches enhance +in-domain robustness by up to 17 percentage points and boost out-of-domain +generalization by 10 percentage points over baseline models. Overall, our +findings provide practical improvements to claim-matching systems, enabling +more reliable fact-checking of evolving misinformation. + +
+
+
+
+
+ + ☆ Augmentation-Based Deep Learning for Identification of Circulating Tumor + Cells + + +
+ Circulating tumor cells (CTCs) are crucial biomarkers in liquid biopsy, +offering a noninvasive tool for cancer patient management. However, their +identification remains particularly challenging due to their limited number and +heterogeneity. Labeling samples for contrast limits the generalization of +fluorescence-based methods across different hospital datasets. Analyzing +single-cell images enables detailed assessment of cell morphology, subcellular +structures, and phenotypic variations, often hidden in clustered images. +Developing a method based on bright-field single-cell analysis could overcome +these limitations. CTCs can be isolated using an unbiased workflow combining +Parsortix technology, which selects cells based on size and deformability, with +DEPArray technology, enabling precise visualization and selection of single +cells. Traditionally, DEPArray-acquired digital images are manually analyzed, +making the process time-consuming and prone to variability. In this study, we +present a Deep Learning-based classification pipeline designed to distinguish +CTCs from leukocytes in blood samples, aimed to enhance diagnostic accuracy and +optimize clinical workflows. Our approach employs images from the bright-field +channel acquired through DEPArray technology leveraging a ResNet-based CNN. To +improve model generalization, we applied three types of data augmentation +techniques and incorporated fluorescence (DAPI) channel images into the +training phase, allowing the network to learn additional CTC-specific features. +Notably, only bright-field images have been used for testing, ensuring the +model's ability to identify CTCs without relying on fluorescence markers. The +proposed model achieved an F1-score of 0.798, demonstrating its capability to +distinguish CTCs from leukocytes. These findings highlight the potential of DL +in refining CTC analysis and advancing liquid biopsy applications. + +
+
+ comment: 20 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ AI-Driven Multi-Stage Computer Vision System for Defect Detection in + Laser-Engraved Industrial Nameplates + + +
+ Automated defect detection in industrial manufacturing is essential for +maintaining product quality and minimizing production errors. In air disc brake +manufacturing, ensuring the precision of laser-engraved nameplates is crucial +for accurate product identification and quality control. Engraving errors, such +as misprints or missing characters, can compromise both aesthetics and +functionality, leading to material waste and production delays. This paper +presents a proof of concept for an AI-driven computer vision system that +inspects and verifies laser-engraved nameplates, detecting defects in logos and +alphanumeric strings. The system integrates object detection using YOLOv7, +optical character recognition (OCR) with Tesseract, and anomaly detection +through a residual variational autoencoder (ResVAE) along with other computer +vision methods to enable comprehensive inspections at multiple stages. +Experimental results demonstrate the system's effectiveness, achieving 91.33% +accuracy and 100% recall, ensuring that defective nameplates are consistently +detected and addressed. This solution highlights the potential of AI-driven +visual inspection to enhance quality control, reduce manual inspection efforts, +and improve overall manufacturing efficiency. + +
+
+
+
+
+ + ☆ Multi-Agent DRL for Queue-Aware Task Offloading in Hierarchical + MEC-Enabled Air-Ground Networks + + +
+ Mobile edge computing (MEC)-enabled air-ground networks are a key component +of 6G, employing aerial base stations (ABSs) such as unmanned aerial vehicles +(UAVs) and high-altitude platform stations (HAPS) to provide dynamic services +to ground IoT devices (IoTDs). These IoTDs support real-time applications +(e.g., multimedia and Metaverse services) that demand high computational +resources and strict quality of service (QoS) guarantees in terms of latency +and task queue management. Given their limited energy and processing +capabilities, IoTDs rely on UAVs and HAPS to offload tasks for distributed +processing, forming a multi-tier MEC system. This paper tackles the overall +energy minimization problem in MEC-enabled air-ground integrated networks +(MAGIN) by jointly optimizing UAV trajectories, computing resource allocation, +and queue-aware task offloading decisions. The optimization is challenging due +to the nonconvex, nonlinear nature of this hierarchical system, which renders +traditional methods ineffective. We reformulate the problem as a multi-agent +Markov decision process (MDP) with continuous action spaces and heterogeneous +agents, and propose a novel variant of multi-agent proximal policy optimization +with a Beta distribution (MAPPO-BD) to solve it. Extensive simulations show +that MAPPO-BD outperforms baseline schemes, achieving superior energy savings +and efficient resource management in MAGIN while meeting queue delay and edge +computing constraints. + +
+
+
+
+
+ + ☆ From Infants to AI: Incorporating Infant-like Learning in Models Boosts + Efficiency and Generalization in Learning Social Prediction Tasks + + +
+ Early in development, infants learn a range of useful concepts, which can be +challenging from a computational standpoint. This early learning comes together +with an initial understanding of aspects of the meaning of concepts, e.g., +their implications, causality, and using them to predict likely future events. +All this is accomplished in many cases with little or no supervision, and from +relatively few examples, compared with current network models. In learning +about objects and human-object interactions, early acquired and possibly innate +concepts are often used in the process of learning additional, more complex +concepts. In the current work, we model how early-acquired concepts are used in +the learning of subsequent concepts, and compare the results with standard deep +network modeling. We focused in particular on the use of the concepts of +animacy and goal attribution in learning to predict future events. We show that +the use of early concepts in the learning of new concepts leads to better +learning (higher accuracy) and more efficient learning (requiring less data). +We further show that this integration of early and new concepts shapes the +representation of the concepts acquired by the model. The results show that +when the concepts were learned in a human-like manner, the emerging +representation was more useful, as measured in terms of generalization to novel +data and tasks. On a more general level, the results suggest that there are +likely to be basic differences in the conceptual structures acquired by current +network models compared to human learning. + +
+
+
+
+
+ + ☆ Transformers for molecular property prediction: Domain adaptation + efficiently improves performance + + +
+ Most of the current transformer-based chemical language models are +pre-trained on millions to billions of molecules. However, the improvement from +such scaling in dataset size is not confidently linked to improved molecular +property prediction. The aim of this study is to investigate and overcome some +of the limitations of transformer models in predicting molecular properties. +Specifically, we examine the impact of pre-training dataset size and diversity +on the performance of transformer models and investigate the use of domain +adaptation as a technique for improving model performance. First, our findings +indicate that increasing pretraining dataset size beyond 400K molecules from +the GuacaMol dataset does not result in a significant improvement on four ADME +endpoints, namely, solubility, permeability, microsomal stability, and plasma +protein binding. Second, our results demonstrate that using domain adaptation +by further training the transformer model on a small set of domain-relevant +molecules, i.e., a few hundred to a few thousand, using multi-task regression +of physicochemical properties was sufficient to significantly improve +performance for three out of the four investigated ADME endpoints (P-value < +0.001). Finally, we observe that a model pre-trained on 400K molecules and +domain-adapted on a few hundred/thousand molecules performs similarly (P-value +> 0.05) to more complicated transformer models like MolBERT (pre-trained on 1.3M +molecules) and MolFormer (pre-trained on 100M molecules). A comparison to a +random forest model trained on basic physicochemical properties showed similar +performance to the examined transformer models. We believe that current +transformer models can be improved through further systematic analysis of +pre-training and downstream data, pre-training objectives, and scaling laws, +ultimately leading to better and more helpful models. + +
+
+
+
+
+ + ☆ Leveraging Large Language Models to Develop Heuristics for Emerging + Optimization Problems + + +
+ Combinatorial optimization problems often rely on heuristic algorithms to +generate efficient solutions. However, the manual design of heuristics is +resource-intensive and constrained by the designer's expertise. Recent advances +in artificial intelligence, particularly large language models (LLMs), have +demonstrated the potential to automate heuristic generation through +evolutionary frameworks. Recent works focus only on well-known combinatorial +optimization problems like the traveling salesman problem and online bin +packing problem when designing constructive heuristics. This study investigates +whether LLMs can effectively generate heuristics for niche, not yet broadly +researched optimization problems, using the unit-load pre-marshalling problem +as an example case. We propose the Contextual Evolution of Heuristics (CEoH) +framework, an extension of the Evolution of Heuristics (EoH) framework, which +incorporates problem-specific descriptions to enhance in-context learning +during heuristic generation. Through computational experiments, we evaluate +CEoH and EoH and compare the results. Results indicate that CEoH enables +smaller LLMs to generate high-quality heuristics more consistently and even +outperform larger models. Larger models demonstrate robust performance with or +without contextualized prompts. The generated heuristics exhibit scalability to +diverse instance configurations. + +
+
+ comment: Under review LION19: The 19th Learning and Intelligent OptimizatioN + Conference +
+
+
+
+
+ + ☆ Navigating Intelligence: A Survey of Google OR-Tools and Machine + Learning for Global Path Planning in Autonomous Vehicles + + +
+ We offer a new in-depth investigation of global path planning (GPP) for +unmanned ground vehicles, an autonomous mining sampling robot named ROMIE. GPP +is essential for ROMIE's optimal performance, which is translated into solving +the traveling salesman problem, a complex graph theory challenge that is +crucial for determining the most effective route to cover all sampling +locations in a mining field. This problem is central to enhancing ROMIE's +operational efficiency and competitiveness against human labor by optimizing +cost and time. The primary aim of this research is to advance GPP by +developing, evaluating, and improving a cost-efficient software and web +application. We delve into an extensive comparison and analysis of Google +operations research (OR)-Tools optimization algorithms. Our study is driven by +the goal of applying and testing the limits of OR-Tools capabilities by +integrating Reinforcement Learning techniques for the first time. This enables +us to compare these methods with OR-Tools, assessing their computational +effectiveness and real-world application efficiency. Our analysis seeks to +provide insights into the effectiveness and practical application of each +technique. Our findings indicate that Q-Learning stands out as the optimal +strategy, demonstrating superior efficiency by deviating only 1.2% on average +from the optimal solutions across our datasets. + +
+
+
+
+
+ + ☆ See What You Are Told: Visual Attention Sink in Large Multimodal Models + + +
+ Large multimodal models (LMMs) "see" images by leveraging the attention +mechanism between text and visual tokens in the transformer decoder. Ideally, +these models should focus on key visual information relevant to the text token. +However, recent findings indicate that LMMs have an extraordinary tendency to +consistently allocate high attention weights to specific visual tokens, even +when these tokens are irrelevant to the corresponding text. In this study, we +investigate the property behind the appearance of these irrelevant visual +tokens and examine their characteristics. Our findings show that this behavior +arises due to the massive activation of certain hidden state dimensions, which +resembles the attention sink found in language models. Hence, we refer to this +phenomenon as the visual attention sink. In particular, our analysis reveals +that removing the irrelevant visual sink tokens does not impact model +performance, despite receiving high attention weights. Consequently, we recycle +the attention to these tokens as surplus resources, redistributing the +attention budget to enhance focus on the image. To achieve this, we introduce +Visual Attention Redistribution (VAR), a method that redistributes attention in +image-centric heads, which we identify as innately focusing on visual +information. VAR can be seamlessly applied across different LMMs to improve +performance on a wide range of tasks, including general vision-language tasks, +visual hallucination tasks, and vision-centric tasks, all without the need for +additional training, models, or inference steps. Experimental results +demonstrate that VAR enables LMMs to process visual information more +effectively by adjusting their internal attention mechanisms, offering a new +direction to enhancing the multimodal capabilities of LMMs. + +
+
+
+
+
+ + ☆ Exploring specialization and sensitivity of convolutional neural + networks in the context of simultaneous image augmentations + + +
+ Drawing parallels with the way biological networks are studied, we adapt the +treatment--control paradigm to explainable artificial intelligence research and +enrich it through multi-parametric input alterations. In this study, we propose +a framework for investigating the internal inference impacted by input data +augmentations. The internal changes in network operation are reflected in +activation changes measured by variance, which can be decomposed into +components related to each augmentation, employing Sobol indices and Shapley +values. These quantities enable one to visualize sensitivity to different +variables and use them for guided masking of activations. In addition, we +introduce a way of single-class sensitivity analysis where the candidates are +filtered according to their matching to prediction bias generated by targeted +damaging of the activations. Relying on the observed parallels, we assume that +the developed framework can potentially be transferred to studying biological +neural networks in complex environments. + +
+
+ comment: 26 pages; main text: 5 figures, 4 tables; appendix: 4 sections, 3 + tables; supplementary: 7 files (figures S1-S6: packed as 7z archive, S7: + single pdf file) +
+
+
+
+
+ + ☆ Benchmarking Dynamic SLO Compliance in Distributed Computing Continuum + Systems + + +
+ Ensuring Service Level Objectives (SLOs) in large-scale architectures, such +as Distributed Computing Continuum Systems (DCCS), is challenging due to their +heterogeneous nature and varying service requirements across different devices +and applications. Additionally, unpredictable workloads and resource +limitations lead to fluctuating performance and violated SLOs. To improve SLO +compliance in DCCS, one possibility is to apply machine learning; however, the +design choices are often left to the developer. To that extent, we provide a +benchmark of Active Inference -- an emerging method from neuroscience -- +against three established reinforcement learning algorithms (Deep Q-Network, +Advantage Actor-Critic, and Proximal Policy Optimization). We consider a +realistic DCCS use case: an edge device running a video conferencing +application alongside a WebSocket server streaming videos. Using one of the +respective algorithms, we continuously monitor key performance metrics, such as +latency and bandwidth usage, to dynamically adjust parameters -- including the +number of streams, frame rate, and resolution -- to optimize service quality +and user experience. To test algorithms' adaptability to constant system +changes, we simulate dynamically changing SLOs and both instant and gradual +data-shift scenarios, such as network bandwidth limitations and fluctuating +device thermal states. Although the evaluated algorithms all showed advantages +and limitations, our findings demonstrate that Active Inference is a promising +approach for ensuring SLO compliance in DCCS, offering lower memory usage, +stable CPU utilization, and fast convergence. + +
+
+
+
+
+ + ☆ Conformal Transformations for Symmetric Power Transformers SC + + +
+ Transformers with linear attention offer significant computational advantages +over softmax-based transformers but often suffer from degraded performance. The +symmetric power (sympow) transformer, a particular type of linear transformer, +addresses some of this performance gap by leveraging symmetric tensor +embeddings, achieving comparable performance to softmax transformers. However, +the finite capacity of the recurrent state in sympow transformers limits their +ability to retain information, leading to performance degradation when scaling +the training or evaluation context length. To address this issue, we propose +the conformal-sympow transformer, which dynamically frees up capacity using +data-dependent multiplicative gating and adaptively stores information using +data-dependent rotary embeddings. Preliminary experiments on the LongCrawl64 +dataset demonstrate that conformal-sympow overcomes the limitations of sympow +transformers, achieving robust performance across scaled training and +evaluation contexts. + +
+
+ comment: SCOPE Workshop at ICLR 2025 +
+
+
+
+
+ + ☆ Trajectory Prediction for Autonomous Driving: Progress, Limitations, and + Future Directions + + +
+ As the potential for autonomous vehicles to be integrated on a large scale +into modern traffic systems continues to grow, ensuring safe navigation in +dynamic environments is crucial for smooth integration. To guarantee safety and +prevent collisions, autonomous vehicles must be capable of accurately +predicting the trajectories of surrounding traffic agents. Over the past +decade, significant efforts from both academia and industry have been dedicated +to designing solutions for precise trajectory forecasting. These efforts have +produced a diverse range of approaches, raising questions about the differences +between these methods and whether trajectory prediction challenges have been +fully addressed. This paper reviews a substantial portion of recent trajectory +prediction methods and devises a taxonomy to classify existing solutions. A +general overview of the prediction pipeline is also provided, covering input +and output modalities, modeling features, and prediction paradigms discussed in +the literature. In addition, the paper discusses active research areas within +trajectory prediction, addresses the posed research questions, and highlights +the remaining research gaps and challenges. + +
+
+
+
+
+ + ☆ Exploring the Potential of Large Language Models as Predictors in + Dynamic Text-Attributed Graphs + + +
+ With the rise of large language models (LLMs), there has been growing +interest in Graph Foundation Models (GFMs) for graph-based tasks. By leveraging +LLMs as predictors, GFMs have demonstrated impressive generalizability across +various tasks and datasets. However, existing research on LLMs as predictors +has predominantly focused on static graphs, leaving their potential in dynamic +graph prediction unexplored. In this work, we pioneer using LLMs for predictive +tasks on dynamic graphs. We identify two key challenges: the constraints +imposed by context length when processing large-scale historical data and the +significant variability in domain characteristics, both of which complicate the +development of a unified predictor. To address these challenges, we propose the +GraphAgent-Dynamic (GAD) Framework, a multi-agent system that leverages +collaborative LLMs. In contrast to using a single LLM as the predictor, GAD +incorporates global and local summary agents to generate domain-specific +knowledge, enhancing its transferability across domains. Additionally, +knowledge reflection agents enable adaptive updates to GAD's knowledge, +maintaining a unified and self-consistent architecture. In experiments, GAD +demonstrates performance comparable to or even exceeds that of full-supervised +graph neural networks without dataset-specific training. Finally, to enhance +the task-specific performance of LLM-based predictors, we discuss potential +improvements, such as dataset-specific fine-tuning to LLMs. By developing +tailored strategies for different tasks, we provide new insights for the future +design of LLM-based predictors. + +
+
+
+
+
+ + ☆ Less is more? Rewards in RL for Cyber Defence + + +
+ The last few years have seen an explosion of interest in autonomous cyber +defence agents based on deep reinforcement learning. Such agents are typically +trained in a cyber gym environment, also known as a cyber simulator, at least +32 of which have already been built. Most, if not all, cyber gyms provide dense +"scaffolded" reward functions which combine many penalties or incentives for a +range of (un)desirable states and costly actions. Whilst dense rewards help +alleviate the challenge of exploring complex environments, yielding seemingly +effective strategies from relatively few environment steps, they are also known +to bias the solutions an agent can find, potentially towards suboptimal +solutions. Sparse rewards could offer preferable or more effective solutions +and have been overlooked by cyber gyms to date. In this work we set out to +evaluate whether sparse reward functions might enable training more effective +cyber defence agents. Towards this goal we first break down several evaluation +limitations in existing work by proposing a ground truth evaluation score that +goes beyond the standard RL paradigm used to train and evaluate agents. By +adapting a well-established cyber gym to accommodate our methodology and ground +truth score, we propose and evaluate two sparse reward mechanisms and compare +them with a typical dense reward. Our evaluation considers a range of network +sizes, from 2 to 50 nodes, and both reactive and proactive defensive actions. +Our results show that sparse rewards, particularly positive reinforcement for +an uncompromised network state, enable the training of more effective cyber +defence agents. Furthermore, we show that sparse rewards provide more stable +training than dense rewards, and that both effectiveness and training stability +are robust to a variety of cyber environment considerations. + +
+
+ comment: 4 Pages +
+
+
+
+
+ + ☆ FANS -- Formal Answer Selection for Natural Language Math Reasoning + Using Lean4 + + +
+ Large Language Models (LLMs) have displayed astonishing abilities in various +tasks, especially in text generation, classification, question answering, etc. +However, the reasoning ability of LLMs still faces many debates. The inherent +ambiguity of Natural Language (NL) limits LLMs' ability to perform verifiable +reasoning, making its answers lack coherence and trustworthy support. To tackle +the above problems, we propose a novel framework named FANS: Formal ANswer +Selection for Natural Language Math Reasoning Using Lean4. To the best of our +knowledge, it is the first framework that utilizes Lean4 to enhance LLMs' NL +math reasoning ability. In particular, given an NL math question and +LLM-generated answers, FANS first translates it into Lean4 theorem statements. +Then it tries to prove it using a Lean4 prover and verify it by Lean4. Finally, +it uses the FL result to assist in answer selection. It enhances LLMs' NL math +ability in providing a computer-verifiable solution for its correct answer and +proposes an alternative method for answer selection beyond the reward model. +Extensive experiments indicate the effectiveness of our framework. It can +improve the accuracy rate of reward model enhanced LLMs in the MATH-500 dataset +by at most 1.91% and AMC-23 by at most 8.33% on strong reward-model baselines. +In some particular fields like number theory that Lean4 experts in, we can even +select all correct solutions. The qualitative analysis also shows our framework +can make NL results formally backed by Lean4 proofs. As a pioneering work in +the corresponding field, we will open-source all our models and datasets to +further boost the development of the field. + +
+
+
+
+
+ + ☆ COSINT-Agent: A Knowledge-Driven Multimodal Agent for Chinese Open + Source Intelligence + + +
+ Open Source Intelligence (OSINT) requires the integration and reasoning of +diverse multimodal data, presenting significant challenges in deriving +actionable insights. Traditional approaches, including multimodal large +language models (MLLMs), often struggle to infer complex contextual +relationships or deliver comprehensive intelligence from unstructured data +sources. In this paper, we introduce COSINT-Agent, a knowledge-driven +multimodal agent tailored to address the challenges of OSINT in the Chinese +domain. COSINT-Agent seamlessly integrates the perceptual capabilities of +fine-tuned MLLMs with the structured reasoning power of the Entity-Event-Scene +Knowledge Graph (EES-KG). Central to COSINT-Agent is the innovative EES-Match +framework, which bridges COSINT-MLLM and EES-KG, enabling systematic +extraction, reasoning, and contextualization of multimodal insights. This +integration facilitates precise entity recognition, event interpretation, and +context retrieval, effectively transforming raw multimodal data into actionable +intelligence. Extensive experiments validate the superior performance of +COSINT-Agent across core OSINT tasks, including entity recognition, EES +generation, and context matching. These results underscore its potential as a +robust and scalable solution for advancing automated multimodal reasoning and +enhancing the effectiveness of OSINT methodologies. + +
+
+
+
+
+ + ☆ NodeReg: Mitigating the Imbalance and Distribution Shift Effects in + Semi-Supervised Node Classification via Norm Consistency + + +
+ Aggregating information from neighboring nodes benefits graph neural networks +(GNNs) in semi-supervised node classification tasks. Nevertheless, this +mechanism also renders nodes susceptible to the influence of their neighbors. +For instance, this will occur when the neighboring nodes are imbalanced or the +neighboring nodes contain noise, which can even affect the GNN's ability to +generalize out of distribution. We find that ensuring the consistency of the +norm for node representations can significantly reduce the impact of these two +issues on GNNs. To this end, we propose a regularized optimization method +called NodeReg that enforces the consistency of node representation norms. This +method is simple but effective and satisfies Lipschitz continuity, thus +facilitating stable optimization and significantly improving semi-supervised +node classification performance under the above two scenarios. To illustrate, +in the imbalance scenario, when training a GCN with an imbalance ratio of 0.1, +NodeReg outperforms the most competitive baselines by 1.4%-25.9% in F1 score +across five public datasets. Similarly, in the distribution shift scenario, +NodeReg outperforms the most competitive baseline by 1.4%-3.1% in accuracy. + +
+
+
+
+
+ + ☆ MA-LoT: Multi-Agent Lean-based Long Chain-of-Thought Reasoning enhances + Formal Theorem Proving + + +
+ Solving mathematical problems using computer-verifiable languages like Lean +has significantly impacted mathematical and computer science communities. +State-of-the-art methods utilize single Large Language Models (LLMs) as agents +or provers to either generate complete proofs or perform tree searches. However, +single-agent methods inherently lack a structured way to combine high-level +reasoning in Natural Language (NL) with Formal Language (FL) verification +feedback. To solve these issues, we propose MA-LoT: Multi-Agent Lean-based Long +Chain-of-Thought framework, (to the best of our knowledge), the first +multi-agent framework for Lean4 theorem proving that balances high-level NL +reasoning and FL verification in Long CoT. Using this structured interaction, +our approach enables deeper insights and long-term coherence in proof +generation, with which past methods struggle. We do this by leveraging emergent +formal reasoning ability in Long CoT using our novel LoT-Transfer Learning +training-inference pipeline. Extensive experiments show that our framework +achieves 54.51% accuracy rate on the Lean4 version of MiniF2F-Test dataset, +largely outperforming GPT-4 (22.95%), single-agent tree search +(InternLM-Step-Prover, 50.70%), and whole-proof generation +(DeepSeek-Prover-v1.5, 48.36%) baselines. Furthermore, our findings highlight +the potential of combining Long CoT with formal verification for a more +insightful generation in a broader perspective. + +
+
+
+
+
+ + ☆ Towards Robust Universal Information Extraction: Benchmark, Evaluation, + and Solution + + +
+ In this paper, we aim to enhance the robustness of Universal Information +Extraction (UIE) by introducing a new benchmark dataset, a comprehensive +evaluation, and a feasible solution. Existing robust benchmark datasets have +two key limitations: 1) They generate only a limited range of perturbations for +a single Information Extraction (IE) task, which fails to evaluate the +robustness of UIE models effectively; 2) They rely on small models or +handcrafted rules to generate perturbations, often resulting in unnatural +adversarial examples. Considering the powerful generation capabilities of Large +Language Models (LLMs), we introduce a new benchmark dataset for Robust UIE, +called RUIE-Bench, which utilizes LLMs to generate more diverse and realistic +perturbations across different IE tasks. Based on this dataset, we +comprehensively evaluate existing UIE models and reveal that both LLM-based +models and other models suffer from significant performance drops. To improve +robustness and reduce training costs, we propose a data-augmentation solution +that dynamically selects hard samples for iterative training based on the +model's inference loss. Experimental results show that training with only +\textbf{15\%} of the data leads to an average \textbf{7.5\%} relative +performance improvement across three IE tasks. + +
+
+
+
+
+ + ☆ Directly Follows Graphs Go Predictive Process Monitoring With Graph + Neural Networks + + +
+ In the past years, predictive process monitoring (PPM) techniques based on +artificial neural networks have evolved as a method to monitor the future +behavior of business processes. Existing approaches mostly focus on +interpreting the processes as sequences, so-called traces, and feeding them to +neural architectures designed to operate on sequential data such as recurrent +neural networks (RNNs) or transformers. In this study, we investigate an +alternative way to perform PPM: by transforming each process in its +directly-follows-graph (DFG) representation we are able to apply graph neural +networks (GNNs) for the prediction tasks. By this, we aim to develop models +that are more suitable for complex processes that are long and contain an +abundance of loops. In particular, we present different ways to create DFG +representations depending on the particular GNN we use. The tested GNNs range +from classical node-based to novel edge-based architectures. Further, we +investigate the possibility of using multi-graphs. By these steps, we aim to +design graph representations that minimize the information loss when +transforming traces into graphs. + +
+
+ comment: 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Structured Outputs Enable General-Purpose LLMs to be Medical Experts + + +
+ Medical question-answering (QA) is a critical task for evaluating how +effectively large language models (LLMs) encode clinical knowledge and +assessing their potential applications in medicine. Despite showing promise on +multiple-choice tests, LLMs frequently struggle with open-ended medical +questions, producing responses with dangerous hallucinations or lacking +comprehensive coverage of critical aspects. Existing approaches attempt to +address these challenges through domain-specific fine-tuning, but this proves +resource-intensive and difficult to scale across models. To improve the +comprehensiveness and factuality of medical responses, we propose a novel +approach utilizing structured medical reasoning. Our method guides LLMs through +a seven-step cognitive process inspired by clinical diagnosis, enabling more +accurate and complete answers without additional training. Experiments on the +MedLFQA benchmark demonstrate that our approach achieves the highest Factuality +Score of 85.8, surpassing fine-tuned models. Notably, this improvement +transfers to smaller models, highlighting the method's efficiency and +scalability. Our code and datasets are available. + +
+
+
+
+
+ + ☆ Intermediate-Task Transfer Learning: Leveraging Sarcasm Detection for + Stance Detection + + +
+ Stance Detection (SD) on social media has emerged as a prominent area of +interest with implications for social business and political applications +thereby garnering escalating research attention within NLP. The inherent +subtlety and complexity of texts procured from online platforms pose challenges +for SD algorithms in accurately discerning the author's stance. Mostly the +inclusion of sarcastic and figurative language drastically impacts the +performance of SD models. This paper addresses this by employing sarcasm +detection intermediate-task transfer learning tailored for SD. The proposed +methodology involves the finetuning of BERT and RoBERTa and the concatenation +of convolutional BiLSTM and dense layers. Rigorous experiments are conducted on +publicly available datasets to evaluate our transfer-learning framework. The +performance of the approach is assessed against various State-Of-The-Art +baselines for SD providing empirical evidence of its effectiveness. Notably our +model outperforms the best SOTA models even prior to sarcasm-detection +pretraining. The integration of sarcasm knowledge into the model proves +instrumental in mitigating misclassifications of sarcastic textual elements in +SD. Our model accurately predicts 85% of texts that were previously +misclassified by the model without sarcasm-detection pretraining thereby +amplifying the average F1-score of the model. Our experiments also revealed +that the success of the transfer-learning framework is contingent upon the +correlation of lexical attributes between the intermediate task and the target +task. This study represents the first exploration of sarcasm detection as an +intermediate transfer-learning task in the context of SD and simultaneously +uses the concatenation of BERT or RoBERTa with other deep-learning techniques +establishing the proposed approach as a foundational baseline for future +research endeavors in this domain. + +
+
+ comment: 8 pages, 2 figures, published in The Sixteenth International + Conference on Information (eKNOW 2024) +
+
+
+
+
+ + ☆ AttackSeqBench: Benchmarking Large Language Models' Understanding of + Sequential Patterns in Cyber Attacks + + +
+ The observations documented in Cyber Threat Intelligence (CTI) reports play a +critical role in describing adversarial behaviors, providing valuable insights +for security practitioners to respond to evolving threats. Recent advancements +of Large Language Models (LLMs) have demonstrated significant potential in +various cybersecurity applications, including CTI report understanding and +attack knowledge graph construction. While previous works have proposed +benchmarks that focus on the CTI extraction ability of LLMs, the sequential +characteristic of adversarial behaviors within CTI reports remains largely +unexplored, which holds considerable significance in developing a comprehensive +understanding of how adversaries operate. To address this gap, we introduce +AttackSeqBench, a benchmark tailored to systematically evaluate LLMs' +capability to understand and reason attack sequences in CTI reports. Our +benchmark encompasses three distinct Question Answering (QA) tasks, each task +focuses on the varying granularity in adversarial behavior. To alleviate the +laborious effort of QA construction, we carefully design an automated dataset +construction pipeline to create scalable and well-formulated QA datasets based +on real-world CTI reports. To ensure the quality of our dataset, we adopt a +hybrid approach of combining human evaluation and systematic evaluation +metrics. We conduct extensive experiments and analysis with both fast-thinking +and slow-thinking LLMs, while highlighting their strengths and limitations in +analyzing the sequential patterns in cyber attacks. The overarching goal of +this work is to provide a benchmark that advances LLM-driven CTI report +understanding and fosters its application in real-world cybersecurity +operations. Our dataset and code are available at +https://github.com/Javiery3889/AttackSeqBench . + +
+
+
+
+
+ + ☆ DiRe-JAX: A JAX based Dimensionality Reduction Algorithm for Large-scale + Data + + +
+ DiRe-JAX is a new dimensionality reduction toolkit designed to address some +of the challenges faced by traditional methods like UMAP and tSNE such as loss +of global structure and computational efficiency. Built on the JAX framework, +DiRe leverages modern hardware acceleration to provide an efficient, scalable, +and interpretable solution for visualizing complex data structures, and for +quantitative analysis of lower-dimensional embeddings. The toolkit shows +considerable promise in preserving both local and global structures within the +data as compared to state-of-the-art UMAP and tSNE implementations. This makes +it suitable for a wide range of applications in machine learning, +bioinformatics, and data science. + +
+
+ comment: 22 pages, 12 figures; Github repository available at + https://github.com/sashakolpakov/dire-jax; package available on PyPi + https://pypi.org/project/dire-jax/ +
+
+
+
+
+ + ☆ Position: Model Collapse Does Not Mean What You Think + + +
+ The proliferation of AI-generated content online has fueled concerns over +\emph{model collapse}, a degradation in future generative models' performance +when trained on synthetic data generated by earlier models. Industry leaders, +premier research journals and popular science publications alike have +prophesied catastrophic societal consequences stemming from model collapse. In +this position piece, we contend this widespread narrative fundamentally +misunderstands the scientific evidence. We highlight that research on model +collapse actually encompasses eight distinct and at times conflicting +definitions of model collapse, and argue that inconsistent terminology within +and between papers has hindered building a comprehensive understanding of model +collapse. To assess how significantly different interpretations of model +collapse threaten future generative models, we posit what we believe are +realistic conditions for studying model collapse and then conduct a rigorous +assessment of the literature's methodologies through this lens. While we leave +room for reasonable disagreement, our analysis of research studies, weighted by +how faithfully each study matches real-world conditions, leads us to conclude +that certain predicted claims of model collapse rely on assumptions and +conditions that poorly match real-world conditions, and in fact several +prominent collapse scenarios are readily avoidable. Altogether, this position +paper argues that model collapse has been warped from a nuanced multifaceted +consideration into an oversimplified threat, and that the evidence suggests +specific harms more likely under society's current trajectory have received +disproportionately less attention. + +
+
+
+
+
+ + ☆ Partial Convolution Meets Visual Attention + + +
+ Designing an efficient and effective neural network has remained a prominent +topic in computer vision research. Depthwise convolution (DWConv) is widely used +in efficient CNNs or ViTs, but it needs frequent memory access during +inference, which leads to low throughput. FasterNet attempts to introduce +partial convolution (PConv) as an alternative to DWConv but compromises the +accuracy due to underutilized channels. To remedy this shortcoming and consider +the redundancy between feature map channels, we introduce a novel Partial +visual ATtention mechanism (PAT) that can efficiently combine PConv with visual +attention. Our exploration indicates that the partial attention mechanism can +completely replace the full attention mechanism and reduce model parameters and +FLOPs. Our PAT can derive three types of blocks: Partial Channel-Attention +block (PAT_ch), Partial Spatial-Attention block (PAT_sp) and Partial +Self-Attention block (PAT_sf). First, PAT_ch integrates the enhanced Gaussian +channel attention mechanism to infuse global distribution information into the +untouched channels of PConv. Second, we introduce the spatial-wise attention to +the MLP layer to further improve model accuracy. Finally, we replace PAT_ch in +the last stage with the self-attention mechanism to extend the global receptive +field. Building upon PAT, we propose a novel hybrid network family, named +PATNet, which achieves superior top-1 accuracy and inference speed compared to +FasterNet on ImageNet-1K classification and excels in both detection and +segmentation on the COCO dataset. Particularly, our PATNet-T2 achieves 1.3% +higher accuracy than FasterNet-T2, while exhibiting 25% higher GPU throughput +and 24% lower CPU latency. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2502.01303 +
+
+
+
+
+ + ☆ Knowledge Augmentation in Federation: Rethinking What Collaborative + Learning Can Bring Back to Decentralized Data + + +
+ Data, as an observable form of knowledge, has become one of the most +important factors of production for the development of Artificial Intelligence +(AI). Meanwhile, increasing legislation and regulations on private and +proprietary information results in scattered data sources also known as the +``data islands''. Although some collaborative learning paradigms such as +Federated Learning (FL) can enable privacy-preserving training over +decentralized data, they have inherent deficiencies in fairness, costs and +reproducibility because of being learning-centric, which greatly limits the way +how participants cooperate with each other. In light of this, we present a +knowledge-centric paradigm termed \emph{Knowledge Augmentation in Federation} +(KAF), with focus on how to enhance local knowledge through collaborative +effort. We provide the suggested system architecture, formulate the +prototypical optimization objective, and review emerging studies that employ +methodologies suitable for KAF. On our roadmap, with a three-way categorization +we describe the methods for knowledge expansion, knowledge filtering, and label +and feature space correction in the federation. Further, we highlight several +challenges and open questions that deserve more attention from the community. +With our investigation, we intend to offer new insights for what collaborative +learning can bring back to decentralized data. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Convergence Analysis of Federated Learning Methods Using Backward Error + Analysis + + +
+ Backward error analysis allows finding a modified loss function, which the +parameter updates really follow under the influence of an optimization method. +The additional loss terms included in this modified function are called the implicit +regularizer. In this paper, we attempt to find the implicit regularizer for +various federated learning algorithms on non-IID data distribution, and explain +why each method shows different convergence behavior. We first show that the +implicit regularizer of FedAvg disperses the gradient of each client from the +average gradient, thus increasing the gradient variance. We also empirically +show that the implicit regularizer hampers its convergence. Similarly, we +compute the implicit regularizers of FedSAM and SCAFFOLD, and explain why they +converge better. While existing convergence analyses focus on pointing out the +advantages of FedSAM and SCAFFOLD, our approach can explain their limitations +in complex non-convex settings. Specifically, we demonstrate that FedSAM can +partially remove the bias in the first-order term of the implicit regularizer +in FedAvg, whereas SCAFFOLD can fully eliminate the bias in the first-order +term, but not in the second-order term. Consequently, the implicit regularizer +can provide a useful insight on the convergence behavior of federated learning +from a different theoretical perspective. + +
+
+
+
+
+ + ☆ L2R: Learning to Reduce Search Space for Generalizable Neural Routing + Solver + + +
+ Constructive neural combinatorial optimization (NCO) has attracted growing +research attention due to its ability to solve complex routing problems without +relying on handcrafted rules. However, existing NCO methods face significant +challenges in generalizing to large-scale problems due to high computational +complexity and inefficient capture of structural patterns. To address this +issue, we propose a novel learning-based search space reduction method that +adaptively selects a small set of promising candidate nodes at each step of the +constructive NCO process. Unlike traditional methods that rely on fixed +heuristics, our selection model dynamically prioritizes nodes based on learned +patterns, significantly reducing the search space while maintaining solution +quality. Experimental results demonstrate that our method, trained solely on +100-node instances from uniform distribution, generalizes remarkably well to +large-scale Traveling Salesman Problem (TSP) and Capacitated Vehicle Routing +Problem (CVRP) instances with up to 1 million nodes from the uniform +distribution and over 80K nodes from other distributions. + +
+
+ comment: 23 pages, 10 figures +
+
+
+
+
+ + ☆ Exploring Neural Ordinary Differential Equations as Interpretable + Healthcare classifiers ACL + + +
+ Deep Learning has emerged as one of the most significant innovations in +machine learning. However, a notable limitation of this field lies in the +``black box'' decision-making processes, which have led to skepticism within +groups like healthcare and scientific communities regarding its applicability. +In response, this study introduces an interpretable approach using Neural +Ordinary Differential Equations (NODEs), a category of neural network models +that exploit the dynamics of differential equations for representation +learning. Leveraging their foundation in differential equations, we illustrate +the capability of these models to continuously process textual data, marking +the first such model of its kind, and thereby proposing a promising direction +for future research in this domain. The primary objective of this research is +to propose a novel architecture for groups like healthcare that require the +predictive capabilities of deep learning while emphasizing the importance of +model transparency demonstrated in NODEs. + +
+
+ comment: ACL SRW Submission +
+
+
+
+
+ + ☆ Towards Understanding Multi-Round Large Language Model Reasoning: + Approximability, Learnability and Generalizability + + +
+ Recent advancements in cognitive science and multi-round reasoning techniques +for Large Language Models (LLMs) suggest that iterative thinking processes +improve problem-solving performance in complex tasks. Inspired by this, +approaches like Chain-of-Thought, debating, and self-refinement have been +applied to auto-regressive LLMs, achieving significant successes in tasks such +as mathematical reasoning, commonsense reasoning, and multi-hop question +answering. Despite these successes, the theoretical basis for how multi-round +reasoning enhances problem-solving abilities remains underexplored. In this +work, we investigate the approximation, learnability, and generalization +properties of multi-round auto-regressive models. We show that Transformers +with finite context windows are universal approximators for steps of +Turing-computable functions and can approximate any Turing-computable +sequence-to-sequence function through multi-round reasoning. We extend PAC +learning to sequence generation and demonstrate that multi-round generation is +learnable even when the sequence length exceeds the model's context window. +Finally, we examine how generalization error propagates across rounds, and show +how the aforementioned approaches can help constrain this error, ensuring +outputs stay within an expectation boundary. This work sheds light on the +systemic theoretical foundations of multi-round sequence learning and +reasoning, emphasizing its role in inference complexity. + +
+
+
+
+
+ + ☆ The Devil Is in the Details: Tackling Unimodal Spurious Correlations for + Generalizable Multimodal Reward Models + + +
+ Multimodal Reward Models (MM-RMs) are crucial for aligning Large Language +Models (LLMs) with human preferences, particularly as LLMs increasingly +interact with multimodal data. However, we find that MM-RMs trained on existing +datasets often struggle to generalize to out-of-distribution data due to their +reliance on unimodal spurious correlations, primarily text-only shortcuts +within the training distribution, which prevents them from leveraging true +multimodal reward functions. To address this, we introduce a Shortcut-aware +MM-RM learning algorithm that mitigates this issue by dynamically reweighting +training samples, shifting the distribution toward better multimodal +understanding, and reducing dependence on unimodal spurious correlations. Our +experiments demonstrate significant improvements in generalization, downstream +task performance, and scalability, establishing a more robust framework for +multimodal reward modeling. + +
+
+
+
+
+ + ☆ A Multimodal Framework for Topic Propagation Classification in Social + Networks + + +
+ The rapid proliferation of the Internet and the widespread adoption of social +networks have significantly accelerated information dissemination. However, +this transformation has introduced complexities in information capture and +processing, posing substantial challenges for researchers and practitioners. +Predicting the dissemination of topic-related information within social +networks has thus become a critical research focus. This paper proposes a +predictive model for topic dissemination in social networks by integrating +multidimensional features derived from key dissemination characteristics. +Specifically, we introduce two novel indicators, user relationship breadth and +user authority, into the PageRank algorithm to quantify user influence more +effectively. Additionally, we employ a Text-CNN model for sentiment +classification, extracting sentiment features from textual content. Temporal +embeddings of nodes are encoded using a Bi-LSTM model to capture temporal +dynamics. Furthermore, we refine the measurement of user interaction traces +with topics, replacing traditional topic view metrics with a more precise +communication characteristics measure. Finally, we integrate the extracted +multidimensional features using a Transformer model, significantly enhancing +predictive performance. Experimental results demonstrate that our proposed +model outperforms traditional machine learning and unimodal deep learning +models in terms of F1-Score, AUC, and Recall, validating its effectiveness in +predicting topic propagation within social networks. + +
+
+
+
+
+ + ☆ SoK: Knowledge is All You Need: Last Mile Delivery for Automated + Provenance-based Intrusion Detection with LLMs + + +
+ Recently, provenance-based intrusion detection systems (PIDSes) have been +widely proposed for endpoint threat analysis. However, due to the lack of +systematic integration and utilization of knowledge, existing PIDSes still +require significant manual intervention for practical deployment, making full +automation challenging. This paper presents a disruptive innovation by +categorizing PIDSes according to the types of knowledge they utilize. In +response to the prevalent issue of ``knowledge silos problem'' in existing +research, we introduce a novel knowledge-driven provenance-based intrusion +detection framework, powered by large language models (LLMs). We also present +OmniSec, a best practice system built upon this framework. By integrating +attack representation knowledge, threat intelligence knowledge, and benign +behavior knowledge, OmniSec outperforms the state-of-the-art approaches on +public benchmark datasets. OmniSec is available online at +https://anonymous.4open.science/r/PIDS-with-LLM-613B. + +
+
+
+
+
+ + ☆ External Reliable Information-enhanced Multimodal Contrastive Learning + for Fake News Detection AAAI'25 + + +
+ With the rapid development of the Internet, the information dissemination +paradigm has changed and the efficiency has been improved greatly. Yet this +also brings the quick spread of fake news and leads to negative impacts on +cyberspace. Currently, the information presentation formats have evolved +gradually, with the news formats shifting from texts to multimodal contents. As +a result, detecting multimodal fake news has become one of the research +hotspots. However, the multimodal fake news detection research field still faces +two main challenges: the inability to fully and effectively utilize multimodal +information for detection, and the low credibility or static nature of the +introduced external information, which limits dynamic updates. To bridge the +gaps, we propose ERIC-FND, an external reliable information-enhanced multimodal +contrastive learning framework for fake news detection. ERIC-FND strengthens +the representation of news contents by entity-enriched external information +enhancement method. It also enriches the multimodal news information via +multimodal semantic interaction method where the multimodal contrastive +learning is employed to make different modality representations learn from each +other. Moreover, an adaptive fusion method is taken to integrate the news +representations from different dimensions for the eventual classification. +Experiments are done on two commonly used datasets in different languages, X +(Twitter) and Weibo. Experiment results demonstrate that our proposed model +ERIC-FND outperforms existing state-of-the-art fake news detection methods +under the same settings. + +
+
+ comment: accepted by AAAI'25 +
+
+
+
+
+ + ♻ ☆ CDS: Data Synthesis Method Guided by Cognitive Diagnosis Theory + + +
+ Large Language Models (LLMs) have achieved significant advancements, but the +increasing complexity of tasks and higher performance demands highlight the +need for continuous improvement. Some approaches utilize synthetic data +generated by advanced LLMs based on evaluation results to train models. +However, conventional evaluation methods fail to provide detailed, fine-grained +profiles of LLMs, limiting their guidance for data synthesis. In this paper, we +introduce the Cognitive Diagnostic Synthesis (CDS) method, which incorporates a +diagnostic process inspired by Cognitive Diagnosis Theory (CDT) to refine +evaluation results and characterize model profiles at the knowledge component +level. Based on these diagnostics, we propose two diagnosis-synthesis +strategies for weakness-targeted data synthesis. Additionally, we present an +enhanced data augmentation and selection pipeline to improve the quality and +diversity of synthesized data. Our experiments with several open-source models +show significant improvements across multiple benchmarks, achieving up to 6.00% +improvement in code generation, 13.10% in mathematical reasoning, and 5.43% in +academic exams. Code and data are available on GitHub. + +
+
+
+
+
+ + ♻ ☆ Interactive Data Harmonization with LLM Agents + + +
+ Data harmonization is an essential task that entails integrating datasets +from diverse sources. Despite years of research in this area, it remains a +time-consuming and challenging task due to schema mismatches, varying +terminologies, and differences in data collection methodologies. This paper +presents the case for agentic data harmonization as a means to both empower +experts to harmonize their data and to streamline the process. We introduce +Harmonia, a system that combines LLM-based reasoning, an interactive user +interface, and a library of data harmonization primitives to automate the +synthesis of data harmonization pipelines. We demonstrate Harmonia in a +clinical data harmonization scenario, where it helps to interactively create +reusable pipelines that map datasets to a standard format. Finally, we discuss +challenges and open problems, and suggest research directions for advancing our +vision. + +
+
+
+
+
+ + ♻ ☆ PARAMANU-GANITA: Can Small Math Language Models Rival with Large + Language Models on Mathematical Reasoning? + + +
+ In this paper, we study whether domain specific pretraining of small +generative language models (SLM) from scratch with domain specialized tokenizer +and Chain-of-Thought (CoT) instruction fine-tuning results in competitive +performance on mathematical reasoning compared to LLMs? Secondly, whether this +approach is environmentally sustainable, highly cost efficient? To address +these research questions, we present Paramanu-Ganita, a 208 million-parameter +novel decoder-only Auto Regressive SLM on mathematics. We performed pretraining +from scratch on 31.5 billion tokens for 170 A100 hours using a context size of +4096 on a mixed mathematical corpus consisting of web pages, source code, +textbooks, CoT templatised StackOverflow QA pairs, and mathematical lecture +notes in LaTeX curated by us. We also trained a math and code specialised BPE +tokenizer. We proposed and performed CoT instruction fine-tuning of +Paramanu-Ganita on the MetaMathQA dataset. Our model Paramanu-Ganita, despite +being 34 times smaller than the 7B LLMs, outperforms generalist LLMs by +approximately 30% points, and even math-specialised LLMs by 3-23% points in +GSM8K test accuracy metric. On MATH benchmark, Paramanu-Ganita outperformed the +various models by 6-8% points. On benchmarks like LogiQA, MMLU (high school, +college level), and competitive exams level, AGIEVAL (AQuA-RAT, SAT-Math), +Paramanu-Ganita outperformed others by 1-4%. Our model is available at +https://huggingface.co/gyanai/paramanu-ganita-208M-hf . + +
+
+
+
+
+ + ♻ ☆ Neural DNF-MT: A Neuro-symbolic Approach for Learning Interpretable and + Editable Policies AAMAS 2025 + + +
+ Although deep reinforcement learning has been shown to be effective, the +model's black-box nature presents barriers to direct policy interpretation. To +address this problem, we propose a neuro-symbolic approach called neural DNF-MT +for end-to-end policy learning. The differentiable nature of the neural DNF-MT +model enables the use of deep actor-critic algorithms for training. At the same +time, its architecture is designed so that trained models can be directly +translated into interpretable policies expressed as standard (bivalent or +probabilistic) logic programs. Moreover, additional layers can be included to +extract abstract features from complex observations, acting as a form of +predicate invention. The logic representations are highly interpretable, and we +show how the bivalent representations of deterministic policies can be edited +and incorporated back into a neural model, facilitating manual intervention and +adaptation of learned policies. We evaluate our approach on a range of tasks +requiring learning deterministic or stochastic behaviours from various forms of +observations. Our empirical results show that our neural DNF-MT model performs +at the level of competing black-box methods whilst providing interpretable +policies. + +
+
+ comment: AAMAS 2025 (with Appendix) +
+
+
+
+
+ + ♻ ☆ Beyond Matryoshka: Revisiting Sparse Coding for Adaptive Representation + + +
+ Many large-scale systems rely on high-quality deep representations +(embeddings) to facilitate tasks like retrieval, search, and generative +modeling. Matryoshka Representation Learning (MRL) recently emerged as a +solution for adaptive embedding lengths, but it requires full model retraining +and suffers from noticeable performance degradations at short lengths. In this +paper, we show that sparse coding offers a compelling alternative for achieving +adaptive representation with minimal overhead and higher fidelity. We propose +Contrastive Sparse Representation (CSR), a method that sparsifies pre-trained +embeddings into a high-dimensional but selectively activated feature space. By +leveraging lightweight autoencoding and task-aware contrastive objectives, CSR +preserves semantic quality while allowing flexible, cost-effective inference at +different sparsity levels. Extensive experiments on image, text, and multimodal +benchmarks demonstrate that CSR consistently outperforms MRL in terms of both +accuracy and retrieval speed-often by large margins-while also cutting training +time to a fraction of that required by MRL. Our results establish sparse coding +as a powerful paradigm for adaptive representation learning in real-world +applications where efficiency and fidelity are both paramount. Code is +available at https://github.com/neilwen987/CSR_Adaptive_Rep + +
+
+ comment: A novel sparse coding framework designed for learning adaptive + representation +
+
+
+
+
+ + ♻ ☆ DelTA: An Online Document-Level Translation Agent Based on Multi-Level + Memory ICLR 2025 + + +
+ Large language models (LLMs) have achieved reasonable quality improvements in +machine translation (MT). However, most current research on MT-LLMs still faces +significant challenges in maintaining translation consistency and accuracy when +processing entire documents. In this paper, we introduce DelTA, a +Document-levEL Translation Agent designed to overcome these limitations. DelTA +features a multi-level memory structure that stores information across various +granularities and spans, including Proper Noun Records, Bilingual Summary, +Long-Term Memory, and Short-Term Memory, which are continuously retrieved and +updated by auxiliary LLM-based components. Experimental results indicate that +DelTA significantly outperforms strong baselines in terms of translation +consistency and quality across four open/closed-source LLMs and two +representative document translation datasets, achieving an increase in +consistency scores by up to 4.58 percentage points and in COMET scores by up to +3.16 points on average. DelTA employs a sentence-by-sentence translation +strategy, ensuring no sentence omissions and offering a memory-efficient +solution compared to the mainstream method. Furthermore, DelTA improves pronoun +and context-dependent translation accuracy, and the summary component of the +agent also shows promise as a tool for query-based summarization tasks. The +code and data of our approach are released at +https://github.com/YutongWang1216/DocMTAgent. + +
+
+ comment: Accepted as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ PyGen: A Collaborative Human-AI Approach to Python Package Creation + + +
+ The principles of automation and innovation serve as foundational elements +for advancement in contemporary science and technology. Here, we introduce +Pygen, an automation platform designed to empower researchers, technologists, +and hobbyists to bring abstract ideas to life as core, usable software tools +written in Python. Pygen leverages the immense power of autoregressive large +language models to augment human creativity during the ideation, iteration, and +innovation process. By combining state-of-the-art language models with +open-source code generation technologies, Pygen has significantly reduced the +manual overhead of tool development. From a user prompt, Pygen automatically +generates Python packages for a complete workflow from concept to package +generation and documentation. The findings of our work show that Pygen +considerably enhances the researcher's productivity by enabling the creation of +resilient, modular, and well-documented packages for various specialized +purposes. We employ a prompt enhancement approach to distill the user's package +description into increasingly specific and actionable prompts. While being inherently +an open-ended task, we have evaluated the generated packages and the +documentation using Human Evaluation, LLM-based evaluation, and CodeBLEU, with +detailed results in the results section. Furthermore, we documented our +results, analyzed the limitations, and suggested strategies to alleviate them. +Pygen is our vision of ethical automation, a framework that promotes +inclusivity, accessibility, and collaborative development. This project marks +the beginning of a large-scale effort towards creating tools where intelligent +agents collaborate with humans to improve scientific and technological +development substantially. + Our code and generated examples are open-sourced at +[https://github.com/GitsSaikat/Pygen] + +
+
+ comment: 33 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Bonsai: Gradient-free Graph Distillation for Node Classification + + +
+ Graph distillation has emerged as a promising avenue to enable scalable +training of GNNs by compressing the training dataset while preserving essential +graph characteristics. Our study uncovers significant shortcomings in current +graph distillation techniques. First, the majority of the algorithms +paradoxically require training on the full dataset to perform distillation. +Second, due to their gradient-emulating approach, these methods require fresh +distillation for any change in hyperparameters or GNN architecture, limiting +their flexibility and reusability. Finally, they fail to achieve substantial +size reduction due to synthesizing fully-connected, edge-weighted graphs. To +address these challenges, we present Bonsai, a novel graph distillation method +empowered by the observation that \textit{computation trees} form the +fundamental processing units of message-passing GNNs. Bonsai distills datasets +by encoding a careful selection of \textit{exemplar} trees that maximize the +representation of all computation trees in the training set. This unique +approach imparts Bonsai as the first linear-time, model-agnostic graph +distillation algorithm for node classification that outperforms existing +baselines across $6$ real-world datasets on accuracy, while being $22$ times +faster on average. Bonsai is grounded in rigorous mathematical guarantees on +the adopted approximation strategies making it robust to GNN architectures, +datasets, and parameters. + +
+
+
+
+
+ + ♻ ☆ A privacy-preserving, distributed and cooperative FCM-based learning + approach for cancer research + + +
+ Distributed Artificial Intelligence is attracting interest day by day. In +this paper, the authors introduce an innovative methodology for distributed +learning of Particle Swarm Optimization-based Fuzzy Cognitive Maps in a +privacy-preserving way. The authors design a training scheme for collaborative +FCM learning that offers data privacy compliant with the current regulation. +This method is applied to a cancer detection problem, proving that the +performance of the model is improved by the Federated Learning process, and +obtaining similar results to the ones that can be found in the literature. + +
+
+ comment: Rough Sets: International Joint Conference, IJCRS 2020 +
+
+
+
+
+ + ♻ ☆ SMAC-R1: The Emergence of Intelligence in Decision-Making Tasks + + +
+ StarCraft Multi-Agent Challenge (SMAC) has been one of the most commonly used +experimental environments in multi-agent reinforcement learning (MARL), where +the specific task is to control a set number of allied units to defeat enemy +forces. Traditional MARL algorithms often require interacting with the +environment for millions of steps to train a parametric model, of which the +resulting policies are typically non-interpretable with weak transferability. +In this paper, we introduce SMAC-R1 which is based on the Qwen2.5-7B-Base LLM +distilled from DeepSeek-Coder-v2.5-236B. Similar to online reinforcement +learning after behavior cloning in offline learning process, in our pipeline, +agents leverage the DeepSeek LLM to generate decision tree code by providing +task descriptions, and the agents are further self-reflected using feedback +from the rewards provided by the environment. Based on that, we augment the +generated scripts to fine-tune a small LLM, Qwen2.5-7B-Base, to distill the +decision-making ability via Supervised Fine-Tuning (SFT) and enhance the script +generation ability by the Group Relative Policy Optimization (GRPO) algorithm. +We conduct experiments in the original 23 SMAC tasks and 10 newly-designed +tasks to demonstrate that our method can produce high-quality, interpretable +decision trees with minimal environmental exploration. Moreover, these scripts +exhibit strong transferability, successfully applying to homogeneous SMAC +environments without modification. We believe this approach offers a new +direction for solving decision-making tasks and domain-specific LLM training +pipelines in the future. + +
+
+
+
+
+ + ♻ ☆ What to align in multimodal contrastive learning? ICLR 2025 + + +
+ Humans perceive the world through multisensory integration, blending the +information of different modalities to adapt their behavior. Contrastive +learning offers an appealing solution for multimodal self-supervised learning. +Indeed, by considering each modality as a different view of the same entity, it +learns to align features of different modalities in a shared representation +space. However, this approach is intrinsically limited as it only learns shared +or redundant information between modalities, while multimodal interactions can +arise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal +learning strategy that enables the communication between modalities in a single +multimodal space. Instead of imposing cross- or intra- modality constraints, we +propose to align multimodal representations by maximizing the mutual +information between augmented versions of these multimodal features. Our +theoretical analysis shows that shared, synergistic and unique terms of +information naturally emerge from this formulation, allowing us to estimate +multimodal interactions beyond redundancy. We test CoMM both in a controlled +and in a series of real-world settings: in the former, we demonstrate that CoMM +effectively captures redundant, unique and synergistic information between +modalities. In the latter, CoMM learns complex multimodal interactions and +achieves state-of-the-art results on the seven multimodal benchmarks. Code is +available at https://github.com/Duplums/CoMM + +
+
+ comment: ICLR 2025, 25 pages +
+
+
+
+
+ + ♻ ☆ CycleResearcher: Improving Automated Research via Automated Review ICLR 2025 + + +
+ The automation of scientific discovery has been a long-standing goal within +the research community, driven by the potential to accelerate knowledge +creation. While significant progress has been made using commercial large +language models (LLMs) as research assistants or idea generators, the +possibility of automating the entire research process with open-source LLMs +remains largely unexplored. This paper explores the feasibility of using +open-source post-trained LLMs as autonomous agents capable of performing the +full cycle of automated research and review, from literature review and +manuscript preparation to peer review and paper refinement. Our iterative +preference training framework consists of CycleResearcher, which conducts +research tasks, and CycleReviewer, which simulates the peer review process, +providing iterative feedback via reinforcement learning. To train these models, +we develop two new datasets, Review-5k and Research-14k, reflecting real-world +machine learning research and peer review dynamics. Our results demonstrate +that CycleReviewer achieves promising performance with a 26.89\% reduction in +mean absolute error (MAE) compared to individual human reviewers in predicting +paper scores, indicating the potential of LLMs to effectively assist +expert-level research evaluation. In research, the papers generated by the +CycleResearcher model achieved a score of 5.36 in simulated peer reviews, +showing some competitiveness in terms of simulated review scores compared to +the preprint level of 5.24 from human experts, while still having room for +improvement compared to the accepted paper level of 5.69. This work represents +a significant step toward fully automated scientific inquiry, providing ethical +safeguards and exploring AI-driven research capabilities. The code, dataset and +model weights are released at https://wengsyx.github.io/Researcher/ + +
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ DexGraspVLA: A Vision-Language-Action Framework Towards General + Dexterous Grasping + + +
+ Dexterous grasping remains a fundamental yet challenging problem in robotics. +A general-purpose robot must be capable of grasping diverse objects in +arbitrary scenarios. However, existing research typically relies on specific +assumptions, such as single-object settings or limited environments, leading to +constrained generalization. Our solution is DexGraspVLA, a hierarchical +framework that utilizes a pre-trained Vision-Language model as the high-level +task planner and learns a diffusion-based policy as the low-level Action +controller. The key insight lies in iteratively transforming diverse language +and visual inputs into domain-invariant representations, where imitation +learning can be effectively applied due to the alleviation of domain shift. +Thus, it enables robust generalization across a wide range of real-world +scenarios. Notably, our method achieves a 90+% success rate under thousands of +unseen object, lighting, and background combinations in a ``zero-shot'' +environment. Empirical analysis further confirms the consistency of internal +model behavior across environmental variations, thereby validating our design +and explaining its generalization performance. We hope our work can be a step +forward in achieving general dexterous grasping. Our demo and code can be found +at https://dexgraspvla.github.io/. + +
+
+ comment: 21 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ AI Governance through Markets + + +
+ This paper argues that market governance mechanisms should be considered a +key approach in the governance of artificial intelligence (AI), alongside +traditional regulatory frameworks. While current governance approaches have +predominantly focused on regulation, we contend that market-based mechanisms +offer effective incentives for responsible AI development. We examine four +emerging vectors of market governance: insurance, auditing, procurement, and +due diligence, demonstrating how these mechanisms can affirm the relationship +between AI risk and financial risk while addressing capital allocation +inefficiencies. While we do not claim that market forces alone can adequately +protect societal interests, we maintain that standardised AI disclosures and +market mechanisms can create powerful incentives for safe and responsible AI +development. This paper urges regulators, economists, and machine learning +researchers to investigate and implement market-based approaches to AI +governance. + +
+
+
+
+
+ + ♻ ☆ Provable Benefits of Task-Specific Prompts for In-context Learning AISTATS + + +
+ The in-context learning capabilities of modern language models have motivated +a deeper mathematical understanding of sequence models. A line of recent work +has shown that linear attention models can emulate projected gradient descent +iterations to implicitly learn the task vector from the data provided in the +context window. In this work, we consider a novel setting where the global task +distribution can be partitioned into a union of conditional task distributions. +We then examine the use of task-specific prompts and prediction heads for +learning the prior information associated with the conditional task +distribution using a one-layer attention model. Our results on loss landscape +show that task-specific prompts facilitate a covariance-mean decoupling where +prompt-tuning explains the conditional mean of the distribution whereas the +variance is learned/explained through in-context learning. Incorporating +task-specific head further aids this process by entirely decoupling estimation +of mean and variance components. This covariance-mean perspective similarly +explains how jointly training prompt and attention weights can provably help +over fine-tuning after pretraining. + +
+
+ comment: Proceedings of the 28th International Conference on Artificial + Intelligence and Statistics (AISTATS) 2025 +
+
+
+
+
+ + ♻ ☆ MIRROR: A Novel Approach for the Automated Evaluation of Open-Ended + Question Generation NeurIPS'24 + + +
+ Automatic question generation is a critical task that involves evaluating +question quality by considering factors such as engagement, pedagogical value, +and the ability to stimulate critical thinking. These aspects require +human-like understanding and judgment, which automated systems currently lack. +However, human evaluations are costly and impractical for large-scale samples +of generated questions. Therefore, we propose a novel system, MIRROR (Multi-LLM +Iterative Review and Response for Optimized Rating), which leverages large +language models (LLMs) to automate the evaluation process for questions +generated by automated question generation systems. We experimented with +several state-of-the-art LLMs, such as GPT-4, Gemini, and Llama2-70b. We +observed that the scores of human evaluation metrics, namely relevance, +appropriateness, novelty, complexity, and grammaticality, improved when using +the feedback-based approach called MIRROR, tending to be closer to the human +baseline scores. Furthermore, we observed that Pearson's correlation +coefficient between GPT-4 and human experts improved when using our proposed +feedback-based approach, MIRROR, compared to direct prompting for evaluation. +Error analysis shows that our proposed approach, MIRROR, significantly helps to +improve relevance and appropriateness. + +
+
+ comment: NeurIPS'24 Workshop on Large Foundation Models for Educational + Assessment (FM-EduAssess) +
+
+
+
+
+ + ♻ ☆ One-Shot Imitation under Mismatched Execution + + +
+ Human demonstrations as prompts are a powerful way to program robots to do +long-horizon manipulation tasks. However, translating these demonstrations into +robot-executable actions presents significant challenges due to execution +mismatches in movement styles and physical capabilities. Existing methods +either depend on human-robot paired data, which is infeasible to scale, or rely +heavily on frame-level visual similarities that often break down in practice. +To address these challenges, we propose RHyME, a novel framework that +automatically aligns human and robot task executions using optimal transport +costs. Given long-horizon robot demonstrations, RHyME synthesizes semantically +equivalent human videos by retrieving and composing short-horizon human clips. +This approach facilitates effective policy training without the need for paired +data. RHyME successfully imitates a range of cross-embodiment demonstrators, +both in simulation and with a real human hand, achieving over 50\% increase in +task success compared to previous methods. We release our code and datasets at +https://portal-cornell.github.io/rhyme/. + +
+
+
+
+
+ + ♻ ☆ Measuring and identifying factors of individuals' trust in Large + Language Models + + +
+ Large Language Models (LLMs) can engage in human-looking conversational +exchanges. Although conversations can elicit trust between users and LLMs, +scarce empirical research has examined trust formation in human-LLM contexts, +beyond LLMs' trustworthiness or human trust in AI in general. Here, we +introduce the Trust-In-LLMs Index (TILLMI) as a new framework to measure +individuals' trust in LLMs, extending McAllister's cognitive and affective +trust dimensions to LLM-human interactions. We developed TILLMI as a +psychometric scale, prototyped with a novel protocol we called LLM-simulated +validity. The LLM-based scale was then validated in a sample of 1,000 US +respondents. Exploratory Factor Analysis identified a two-factor structure. Two +items were then removed due to redundancy, yielding a final 6-item scale with a +2-factor structure. Confirmatory Factor Analysis on a separate subsample showed +strong model fit ($CFI = .995$, $TLI = .991$, $RMSEA = .046$, $p_{X^2} > .05$). +Convergent validity analysis revealed that trust in LLMs correlated positively +with openness to experience, extraversion, and cognitive flexibility, but +negatively with neuroticism. Based on these findings, we interpreted TILLMI's +factors as "closeness with LLMs" (affective dimension) and "reliance on LLMs" +(cognitive dimension). Younger males exhibited higher closeness with- and +reliance on LLMs compared to older women. Individuals with no direct experience +with LLMs exhibited lower levels of trust compared to LLMs' users. These +findings offer a novel empirical foundation for measuring trust in AI-driven +verbal communication, informing responsible design, and fostering balanced +human-AI collaboration. + +
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ From Informal to Formal -- Incorporating and Evaluating LLMs on Natural + Language Requirements to Verifiable Formal Proofs + + +
+ The research in AI-based formal mathematical reasoning has shown an +unstoppable growth trend. These studies have excelled in mathematical +competitions like IMO and have made significant progress. This paper focuses on +formal verification, an immediate application scenario of formal reasoning, and +breaks it down into sub-tasks. We constructed 18k high-quality +instruction-response pairs across five formal specification languages (Coq, +Lean4, Dafny, ACSL, and TLA+) by distilling gpt-4o and evaluated against ten +open-sourced LLMs, including the recently popular DeepSeek-R1. We also fine-tuned +several 7~8B small models to achieve comparable performance with +DeepSeek-R1-671B. Interestingly, we observed that fine-tuning with formal data +also enhances mathematics, reasoning, and coding capabilities. Fine-tuned +models are released at https://huggingface.co/fm-universe. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ On the Utility of Equivariance and Symmetry Breaking in Deep Learning + Architectures on Point Clouds + + +
+ This paper explores the key factors that influence the performance of models +working with point clouds, across different tasks of varying geometric +complexity. In this work, we explore the trade-offs between flexibility and +weight-sharing introduced by equivariant layers, assessing when equivariance +boosts or detracts from performance. It is often argued that providing more +information as input improves a model's performance. However, if this +additional information breaks certain properties, such as $\mathrm{SE}(3)$ +equivariance, does it remain beneficial? We identify the key aspects of +equivariant and non-equivariant architectures that drive success in different +tasks by benchmarking them on segmentation, regression, and generation tasks +across multiple datasets with increasing complexity. We observe a positive +impact of equivariance, which becomes more pronounced with increasing task +complexity, even when strict equivariance is not required. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Rewarding Doubt: A Reinforcement Learning Approach to Confidence + Calibration of Large Language Models + + +
+ A safe and trustworthy use of Large Language Models (LLMs) requires an +accurate expression of confidence in their answers. We introduce a novel +Reinforcement Learning (RL) approach for LLM calibration that fine-tunes LLMs +to elicit calibrated confidence estimations in their answers to factual +questions. We model the problem as a betting game where the model predicts a +confidence score together with every answer, and design a reward function that +penalizes both over and under-confidence. We prove that under our reward design +an optimal policy would result in a perfectly calibrated confidence estimation. +Our experiments demonstrate significantly improved confidence calibration and +generalization to new tasks without re-training, indicating that our approach +teaches a general confidence awareness. This approach enables the training of +inherently calibrated LLMs. + +
+
+
+
+
+ + ♻ ☆ DeePen: Penetration Testing for Audio Deepfake Detection + + +
+ Deepfakes - manipulated or forged audio and video media - pose significant +security risks to individuals, organizations, and society at large. To address +these challenges, machine learning-based classifiers are commonly employed to +detect deepfake content. In this paper, we assess the robustness of such +classifiers through a systematic penetration testing methodology, which we +introduce as DeePen. Our approach operates without prior knowledge of or access +to the target deepfake detection models. Instead, it leverages a set of +carefully selected signal processing modifications - referred to as attacks - +to evaluate model vulnerabilities. Using DeePen, we analyze both real-world +production systems and publicly available academic model checkpoints, +demonstrating that all tested systems exhibit weaknesses and can be reliably +deceived by simple manipulations such as time-stretching or echo addition. +Furthermore, our findings reveal that while some attacks can be mitigated by +retraining detection systems with knowledge of the specific attack, others +remain persistently effective. We release all associated code. + +
+
+
+
+
+ + ♻ ☆ Bringing AI Participation Down to Scale: A Comment on Open AIs + Democratic Inputs to AI Project + + +
+ In 2023, Open AIs Democratic Inputs program funded 10 teams to design +procedures for public participation in generative AI. In this Perspective, we +review the results of the project. Drawing on interviews with some of the teams +and our own experiences conducting participation exercises, we identify several +shared yet largely unspoken assumptions of the Democratic Inputs program: (1) +that participation must be scalable; (2) that the object of participation is a +single model; (3) that there must be a single form of participation; (4) that the +goal is to extract abstract principles; (5) that these principles should have +consensus; and (6) that publics should be representative. We encourage alternative +forms of participation in AI, perhaps not undertaken by tech companies. + +
+
+
+
+
+ + ♻ ☆ Safety Without Semantic Disruptions: Editing-free Safe Image Generation + via Context-preserving Dual Latent Reconstruction + + +
+ Training multimodal generative models on large, uncurated datasets can result +in users being exposed to harmful, unsafe and controversial or +culturally-inappropriate outputs. While model editing has been proposed to +remove or filter undesirable concepts in embedding and latent spaces, it can +inadvertently damage learned manifolds, distorting concepts in close semantic +proximity. We identify limitations in current model editing techniques, showing +that even benign, proximal concepts may become misaligned. To address the need +for safe content generation, we leverage safe embeddings and a modified +diffusion process with tunable weighted summation in the latent space to +generate safer images. Our method preserves global context without compromising +the structural integrity of the learned manifolds. We achieve state-of-the-art +results on safe image generation benchmarks and offer intuitive control over +the level of model safety. We identify trade-offs between safety and +censorship, which presents a necessary perspective in the development of +ethical AI models. We will release our code. + Keywords: Text-to-Image Models, Generative AI, Safety, Reliability, Model +Editing + +
+
+ comment: This research is supported by the NISDRG project #20100007, funded by + the Australian Government +
+
+
+
+
+ + ♻ ☆ LLMs can be Dangerous Reasoners: Analyzing-based Jailbreak Attack on + Large Language Models + + +
+ The rapid development of Large Language Models (LLMs) has brought significant +advancements across various tasks. However, despite these achievements, LLMs +still exhibit inherent safety vulnerabilities, especially when confronted with +jailbreak attacks. Existing jailbreak methods suffer from two main limitations: +reliance on complicated prompt engineering and iterative optimization, which +lead to low attack success rate (ASR) and attack efficiency (AE). In this work, +we propose an efficient jailbreak attack method, Analyzing-based Jailbreak +(ABJ), which leverages the advanced reasoning capability of LLMs to +autonomously generate harmful content, revealing their underlying safety +vulnerabilities during complex reasoning process. We conduct comprehensive +experiments on ABJ across various open-source and closed-source LLMs. In +particular, ABJ achieves high ASR (82.1% on GPT-4o-2024-11-20) with exceptional +AE among all target LLMs, showcasing its remarkable attack effectiveness, +transferability, and efficiency. Our findings underscore the urgent need to +prioritize and improve the safety of LLMs to mitigate the risks of misuse. + +
+
+
+
+
+ + ♻ ☆ Online Scheduling for LLM Inference with KV Cache Constraints + + +
+ Large Language Model (LLM) inference, where a trained model generates text +one word at a time in response to user prompts, is a computationally intensive +process requiring efficient scheduling to optimize latency and resource +utilization. A key challenge in LLM inference is the management of the +Key-Value (KV) cache, which reduces redundant computations but introduces +memory constraints. In this work, we model LLM inference with KV cache +constraints theoretically and propose novel batching and scheduling algorithms +that minimize inference latency while effectively managing the KV cache's +memory. + We analyze both semi-online and fully online scheduling models, and our +results are threefold. First, we provide a polynomial-time algorithm that +achieves exact optimality in terms of average latency in the semi-online prompt +arrival model. Second, in the fully online case with a stochastic prompt +arrival, we introduce an efficient online scheduling algorithm with constant +regret. Third, we prove that no algorithm (deterministic or randomized) can +achieve a constant competitive ratio in fully online adversarial settings. Our +empirical evaluations on a public LLM inference dataset, using the Llama-70B +model on A100 GPUs, show that our approach significantly outperforms benchmark +algorithms used currently in practice, achieving lower latency while reducing +energy consumption. Overall, our results offer a path toward more sustainable +and cost-effective LLM deployment. + +
+
+ comment: Will add a lemma in the proof of Theorem 5.3 to make the statement + and proof more rigorous +
+
+
+
+
+ + ♻ ☆ RIDE: Enhancing Large Language Model Alignment through Restyled + In-Context Learning Demonstration Exemplars + + +
+ Alignment tuning is crucial for ensuring large language models (LLMs) behave +ethically and helpfully. Current alignment approaches require high-quality +annotations and significant training resources. This paper proposes a low-cost, +tuning-free method using in-context learning (ICL) to enhance LLM alignment. +Through an analysis of high-quality ICL demos, we identified style as a key +factor influencing LLM alignment capabilities and explicitly restyled ICL +exemplars based on this stylistic framework. Additionally, we combined the +restyled demos to achieve a balance between the two conflicting aspects of LLM +alignment--factuality and safety. We packaged the restyled examples as prompts +to trigger few-shot learning, improving LLM alignment. Compared to the best +baseline approach, with an average score of 5.00 as the maximum, our method +achieves a maximum 0.10 increase on the Alpaca task (from 4.50 to 4.60), a 0.22 +enhancement on the Just-eval benchmark (from 4.34 to 4.56), and a maximum +improvement of 0.32 (from 3.53 to 3.85) on the MT-Bench dataset. We release the +code and data at https://github.com/AnonymousCode-ComputerScience/RIDE. + +
+
+ comment: 38 pages, 2 figures, 20 tables; The paper is under review in ARR +
+
+
+
+
+ + ♻ ☆ GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for + Improved Visual Localization + + +
+ Although various visual localization approaches exist, such as scene +coordinate regression and camera pose regression, these methods often struggle +with optimization complexity or limited accuracy. To address these challenges, +we explore the use of novel view synthesis techniques, particularly 3D Gaussian +Splatting (3DGS), which enables the compact encoding of both 3D geometry and +scene appearance. We propose a two-stage procedure that integrates dense and +robust keypoint descriptors from the lightweight XFeat feature extractor into +3DGS, enhancing performance in both indoor and outdoor environments. The coarse +pose estimates are directly obtained via 2D-3D correspondences between the 3DGS +representation and query image descriptors. In the second stage, the initial +pose estimate is refined by minimizing the rendering-based photometric warp +loss. Benchmarking on widely used indoor and outdoor datasets demonstrates +improvements over recent neural rendering-based localization methods, such as +NeRFMatch and PNeRFLoc. + +
+
+ comment: Project website at https://gsplatloc.github.io/ +
+
+
+
+
+ + ♻ ☆ Multimodal Action Quality Assessment + + +
+ Action quality assessment (AQA) is to assess how well an action is performed. +Previous works perform modelling by only the use of visual information, +ignoring audio information. We argue that although AQA is highly dependent on +visual information, the audio is useful complementary information for improving +the score regression accuracy, especially for sports with background music, +such as figure skating and rhythmic gymnastics. To leverage multimodal +information for AQA, i.e., RGB, optical flow and audio information, we propose +a Progressive Adaptive Multimodal Fusion Network (PAMFN) that separately models +modality-specific information and mixed-modality information. Our model +consists of three modality-specific branches that independently explore +modality-specific information and a mixed-modality branch that progressively +aggregates the modality-specific information from the modality-specific +branches. To build the bridge between modality-specific branches and the +mixed-modality branch, three novel modules are proposed. First, a +Modality-specific Feature Decoder module is designed to selectively transfer +modality-specific information to the mixed-modality branch. Second, when +exploring the interaction between modality-specific information, we argue that +using an invariant multimodal fusion policy may lead to suboptimal results, so +as to take the potential diversity in different parts of an action into +consideration. Therefore, an Adaptive Fusion Module is proposed to learn +adaptive multimodal fusion policies in different parts of an action. This +module consists of several FusionNets for exploring different multimodal fusion +strategies and a PolicyNet for deciding which FusionNets are enabled. Third, a +module called Cross-modal Feature Decoder is designed to transfer cross-modal +features generated by Adaptive Fusion Module to the mixed-modality branch. + +
+
+ comment: IEEE Transactions on Image Processing 2024 +
+
+
+
+
+ + ♻ ☆ Handling Spatial-Temporal Data Heterogeneity for Federated Continual + Learning via Tail Anchor CVPR 2025 + + +
+ Federated continual learning (FCL) allows each client to continually update +its knowledge from task streams, enhancing the applicability of federated +learning in real-world scenarios. However, FCL needs to address not only +spatial data heterogeneity between clients but also temporal data heterogeneity +between tasks. In this paper, empirical experiments demonstrate that such +input-level heterogeneity significantly affects the model's internal parameters +and outputs, leading to severe spatial-temporal catastrophic forgetting of +local and previous knowledge. To this end, we propose Federated Tail Anchor +(FedTA) to mix trainable Tail Anchor with the frozen output features to adjust +their position in the feature space, thereby overcoming parameter-forgetting +and output-forgetting. Three novel components are also included: Input +Enhancement for improving the performance of pre-trained models on downstream +tasks; Selective Input Knowledge Fusion for fusion of heterogeneous local +knowledge on the server; and Best Global Prototype Selection for finding the +best anchor point for each class in the feature space. Extensive experiments +demonstrate that FedTA not only outperforms existing FCL methods but also +effectively preserves the relative positions of features. + +
+
+ comment: This paper is accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Promote, Suppress, Iterate: How Language Models Answer One-to-Many + Factual Queries + + +
+ To answer one-to-many factual queries (e.g., listing cities of a country), a +language model (LM) must simultaneously recall knowledge and avoid repeating +previous answers. How are these two subtasks implemented and integrated +internally? Across multiple datasets and models, we identify a +promote-then-suppress mechanism: the model first recalls all answers, and then +suppresses previously generated ones. Specifically, LMs use both the subject +and previous answer tokens to perform knowledge recall, with attention +propagating subject information and MLPs promoting the answers. Then, attention +attends to and suppresses previous answer tokens, while MLPs amplify the +suppression signal. Our mechanism is corroborated by extensive experimental +evidence: in addition to using early decoding and causal tracing, we analyze +how components use different tokens by introducing both Token Lens, which +decodes aggregated attention updates from specified tokens, and a knockout +method that analyzes changes in MLP outputs after removing attention to +specified tokens. Overall, we provide new insights into how LMs' internal +components interact with different input tokens to support complex factual +recall. Code is available at +https://github.com/Lorenayannnnn/how-lms-answer-one-to-many-factual-queries. + +
+
+
+
+
+ + ♻ ☆ Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers + + +
+ We present an approach to modifying Transformer architectures by integrating +graph-aware relational reasoning into the attention mechanism, merging concepts +from graph neural networks and language modeling. Building on the inherent +connection between attention and graph theory, we reformulate the Transformer's +attention mechanism as a graph operation and propose Graph-Aware Isomorphic +Attention. This method leverages advanced graph modeling strategies, including +Graph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA), +to enrich the representation of relational structures. Our approach captures +complex dependencies and generalizes across tasks, as evidenced by a reduced +generalization gap and improved learning performance. Additionally, we expand +the concept of graph-aware attention to introduce Sparse GIN-Attention, a +fine-tuning approach that employs sparse GINs. By interpreting attention +matrices as sparse adjacency graphs, this technique enhances the adaptability +of pre-trained foundational models with minimal computational overhead, +endowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning +achieves improved training dynamics and better generalization compared to +alternative methods like low-rank adaption (LoRA). We discuss latent graph-like +structures within traditional attention mechanisms, offering a new lens through +which Transformers can be understood. By evolving Transformers as hierarchical +GIN models for relational reasoning, this perspective suggests profound +implications for foundational model development, enabling the design of +architectures that dynamically adapt to both local and global dependencies. +Applications in bioinformatics, materials science, language modeling, and +beyond could benefit from this synthesis of relational and sequential data +modeling, setting the stage for interpretable and generalizable modeling +strategies. + +
+
+
+
+
+ + ♻ ☆ Sim2Real within 5 Minutes: Efficient Domain Transfer with Stylized + Gaussian Splatting for Endoscopic Images ICRA 2025 + + +
+ Robot assisted endoluminal intervention is an emerging technique for both +benign and malignant luminal lesions. With vision-based navigation, when +combined with pre-operative imaging data as priors, it is possible to recover +position and pose of the endoscope without the need of additional sensors. In +practice, however, aligning pre-operative and intra-operative domains is +complicated by significant texture differences. Although methods such as style +transfer can be used to address this issue, they require large datasets from +both source and target domains with prolonged training times. This paper +proposes an efficient domain transfer method based on stylized Gaussian +splatting, only requiring a few real images (10 images) with very fast +training time. Specifically, the transfer process includes two phases. In the +first phase, the 3D models reconstructed from CT scans are represented as +differential Gaussian point clouds. In the second phase, only color appearance +related parameters are optimized to transfer the style and preserve the visual +content. A novel structure consistency loss is applied to latent features and +depth levels to enhance the stability of the transferred images. Detailed +validation was performed to demonstrate the performance advantages of the +proposed method compared to that of the current state-of-the-art, highlighting +the potential for intra-operative surgical navigation. + +
+
+ comment: Accepted by ICRA 2025 +
+
+
+
+
+ + ♻ ☆ A Physical Coherence Benchmark for Evaluating Video Generation Models + via Optical Flow-guided Frame Prediction + + +
+ Recent advances in video generation models demonstrate their potential as +world simulators, but they often struggle with videos deviating from physical +laws, a key concern overlooked by most text-to-video benchmarks. We introduce a +benchmark designed specifically to assess the Physical Coherence of generated +videos, PhyCoBench. Our benchmark includes 120 prompts covering 7 categories of +physical principles, capturing key physical laws observable in video content. +We evaluated four state-of-the-art (SoTA) T2V models on PhyCoBench and +conducted manual assessments. Additionally, we propose an automated evaluation +model: PhyCoPredictor, a diffusion model that generates optical flow and video +frames in a cascade manner. Through a consistency evaluation comparing +automated and manual sorting, the experimental results show that PhyCoPredictor +currently aligns most closely with human evaluation. Therefore, it can +effectively evaluate the physical coherence of videos, providing insights for +future model optimization. Our benchmark, including physical coherence prompts, +the automatic evaluation tool PhyCoPredictor, and the generated video dataset, +has been released on GitHub at https://github.com/Jeckinchen/PhyCoBench. + +
+
+
+
+
+ + ♻ ☆ A Survey on LLM Test-Time Compute via Search: Tasks, LLM Profiling, + Search Algorithms, and Relevant Frameworks + + +
+ LLM test-time compute (or LLM inference) via search has emerged as a +promising research area with rapid developments. However, current frameworks +often adopt distinct perspectives on three key aspects (task definition, LLM +profiling, and search procedures), making direct comparisons challenging. +Moreover, the search algorithms employed often diverge from standard +implementations, and their specific characteristics are not thoroughly +specified. In this survey, we provide a comprehensive technical review that +unifies task definitions and provides modular definitions of LLM profiling and +search procedures. The definitions enable precise comparisons of various LLM +inference frameworks while highlighting their departures from conventional +search algorithms. We also discuss the applicability, performance, and +efficiency of these methods. We have updated our content to include the latest +papers, and the differences between versions are highlighted in the appendix. +For further details and ongoing updates, please refer to our GitHub repository: +https://github.com/xinzhel/LLM-Agent-Survey/blob/main/search.md + +
+
+
+
+
+ + ♻ ☆ LADDER: Self-Improving LLMs Through Recursive Problem Decomposition + + +
+ We introduce LADDER (Learning through Autonomous Difficulty-Driven Example +Recursion), a framework which enables Large Language Models to autonomously +improve their problem-solving capabilities through self-guided learning by +recursively generating and solving progressively simpler variants of complex +problems. Unlike prior approaches that require curated datasets or human +feedback, LADDER leverages a model's own capabilities to generate easier +question variants. We demonstrate LADDER's effectiveness in the subject of +mathematical integration, improving Llama 3.2 3B's accuracy from 1% to 82% on +undergraduate-level problems and enabling Qwen2.5 7B Deepseek-R1 Distilled to +achieve 73% on the MIT Integration Bee qualifying examination. We also +introduce TTRL (Test-Time Reinforcement Learning), where we perform +reinforcement learning on variants of test problems at inference time. TTRL +enables Qwen2.5 7B Deepseek-R1 Distilled to achieve a state-of-the-art score of +90% on the MIT Integration Bee qualifying examination, surpassing OpenAI o1's +performance. These results show how self-directed strategic learning can +achieve significant capability improvements without relying on architectural +scaling or human supervision. + +
+
+
+
+
+ + ♻ ☆ ChaI-TeA: A Benchmark for Evaluating Autocompletion of Interactions with + LLM-based Chatbots + + +
+ The rise of LLMs has deflected a growing portion of human-computer +interactions towards LLM-based chatbots. The remarkable abilities of these +models allow users to interact using long, diverse natural language text +covering a wide range of topics and styles. Phrasing these messages is a time +and effort consuming task, calling for an autocomplete solution to assist +users. We introduce the task of chatbot interaction autocomplete. We present +ChaI-TeA: CHat InTEraction Autocomplete; An autocomplete evaluation framework +for LLM-based chatbot interactions. The framework includes a formal definition +of the task, coupled with suitable datasets and metrics. After +formally defining the task along with suitable datasets and +metrics, we use the framework to test 9 models on the defined auto completion task, finding that +while current off-the-shelf models perform fairly, there is still much room for +improvement, mainly in ranking of the generated suggestions. We provide +insights for practitioners working on this task and open new research +directions for researchers in the field. We release our framework to serve as a +foundation for future research. + +
+
+
+
+
+ + ♻ ☆ AIArena: A Blockchain-Based Decentralized AI Training Platform WWW + + +
+ The rapid advancement of AI has underscored critical challenges in its +development and implementation, largely due to centralized control by a few +major corporations. This concentration of power intensifies biases within AI +models, resulting from inadequate governance and oversight mechanisms. +Additionally, it limits public involvement and heightens concerns about the +integrity of model generation. Such monopolistic control over data and AI +outputs threatens both innovation and fair data usage, as users inadvertently +contribute data that primarily benefits these corporations. In this work, we +propose AIArena, a blockchain-based decentralized AI training platform designed +to democratize AI development and alignment through on-chain incentive +mechanisms. AIArena fosters an open and collaborative environment where +participants can contribute models and computing resources. Its on-chain +consensus mechanism ensures fair rewards for participants based on their +contributions. We instantiate and implement AIArena on the public Base +blockchain Sepolia testnet, and the evaluation results demonstrate the +feasibility of AIArena in real-world applications. + +
+
+ comment: Camera ready version. Accepted by the ACM Web Conference (WWW), 2025 +
+
+
+
+
+ + ♻ ☆ Schedule On the Fly: Diffusion Time Prediction for Faster and Better + Image Generation + + +
+ Diffusion and flow matching models have achieved remarkable success in +text-to-image generation. However, these models typically rely on the +predetermined denoising schedules for all prompts. The multi-step reverse +diffusion process can be regarded as a kind of chain-of-thought for generating +high-quality images step by step. Therefore, diffusion models should reason for +each instance to adaptively determine the optimal noise schedule, achieving +high generation quality with sampling efficiency. In this paper, we introduce +the Time Prediction Diffusion Model (TPDM) for this. TPDM employs a +plug-and-play Time Prediction Module (TPM) that predicts the next noise level +based on current latent features at each denoising step. We train the TPM using +reinforcement learning to maximize a reward that encourages high final image +quality while penalizing excessive denoising steps. With such an adaptive +scheduler, TPDM not only generates high-quality images that are aligned closely +with human preferences but also adjusts diffusion time and the number of +denoising steps on the fly, enhancing both performance and efficiency. With +Stable Diffusion 3 Medium architecture, TPDM achieves an aesthetic score of +5.44 and a human preference score (HPS) of 29.59, while using around 50% fewer +denoising steps to achieve better performance. + +
+
+
+
+
+ + ♻ ☆ Prompt-Matcher: Leveraging Large Models to Reduce Uncertainty in Schema + Matching Results + + +
+ Schema matching is the process of identifying correspondences between the +elements of two given schemata, essential for database management systems, data +integration, and data warehousing. For datasets across different scenarios, the +optimal schema matching algorithm is different. For a single algorithm, +hyperparameter tuning also causes multiple results. All results assigned equal +probabilities are stored in probabilistic databases to facilitate uncertainty +management. The substantial degree of uncertainty diminishes the efficiency and +reliability of data processing, thereby precluding the provision of more +accurate information for decision-makers. To address this problem, we introduce +a new approach based on fine-grained correspondence verification with specific +prompt of Large Language Model. + Our approach is an iterative loop that consists of three main components: (1) +the correspondence selection algorithm, (2) correspondence verification, and +(3) the update of probability distribution. The core idea is that +correspondences intersect across multiple results, thereby linking the +verification of correspondences to the reduction of uncertainty in candidate +results. + The task of selecting an optimal correspondence set to maximize the +anticipated uncertainty reduction within a fixed budgetary framework is +established as an NP-hard problem. We propose a novel $(1-1/e)$-approximation +algorithm that significantly outperforms brute-force algorithm in terms of +computational efficiency. To enhance correspondence verification, we have +developed two prompt templates that enable GPT-4 to achieve state-of-the-art +performance across two established benchmark datasets. Our comprehensive +experimental evaluation demonstrates the superior effectiveness and robustness +of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ Explaining Vision-Language Similarities in Dual Encoders with + Feature-Pair Attributions + + +
+ Dual encoder architectures like CLIP models map two types of inputs into a +shared embedding space and predict similarities between them. Despite their +success, it is, however, not understood how these models compare their two +inputs. Common first-order feature-attribution methods can only provide limited +insights into dual-encoders since their predictions depend on +feature-interactions rather than on individual features. In this paper, we +first derive a second-order method enabling the attribution of predictions by +any differentiable dual encoder onto feature-interactions between its inputs. +Second, we apply our method to CLIP models and show that they learn +fine-grained correspondences between parts of captions and regions in images. +They match objects across input modes and also account for mismatches. This +visual-linguistic grounding ability, however, varies heavily between object +classes and exhibits pronounced out-of-domain effects. We can identify +individual errors as well as systematic failure categories including object +coverage, unusual scenes and correlated contexts. + +
+
+
+
+
+ + ♻ ☆ Bounding Evidence and Estimating Log-Likelihood in VAE AISTATS 2023 + + +
+ Many crucial problems in deep learning and statistical inference are caused +by a variational gap, i.e., a difference between model evidence +(log-likelihood) and evidence lower bound (ELBO). In particular, in a classical +VAE setting that involves training via an ELBO cost function, it is difficult +to provide a robust comparison of the effects of training between models, since +we do not know a log-likelihood of data (but only its lower bound). In this +paper, to deal with this problem, we introduce a general and effective upper +bound, which allows us to efficiently approximate the evidence of data. We +provide extensive theoretical and experimental studies of our approach, +including its comparison to the other state-of-the-art upper bounds, as well as +its application as a tool for the evaluation of models that were trained on +various lower bounds. + +
+
+ comment: Paper accepted for AISTATS 2023 +
+
+
+
+
+ + ♻ ☆ TAG: A Decentralized Framework for Multi-Agent Hierarchical + Reinforcement Learning + + +
+ Hierarchical organization is fundamental to biological systems and human +societies, yet artificial intelligence systems often rely on monolithic +architectures that limit adaptability and scalability. Current hierarchical +reinforcement learning (HRL) approaches typically restrict hierarchies to two +levels or require centralized training, which limits their practical +applicability. We introduce TAME Agent Framework (TAG), a framework for +constructing fully decentralized hierarchical multi-agent systems. TAG enables +hierarchies of arbitrary depth through a novel LevelEnv concept, which +abstracts each hierarchy level as the environment for the agents above it. This +approach standardizes information flow between levels while preserving loose +coupling, allowing for seamless integration of diverse agent types. We +demonstrate the effectiveness of TAG by implementing hierarchical architectures +that combine different RL agents across multiple levels, achieving improved +performance over classical multi-agent RL baselines on standard benchmarks. Our +results show that decentralized hierarchical organization enhances both +learning speed and final performance, positioning TAG as a promising direction +for scalable multi-agent systems. + +
+
+
+
+
+ + ♻ ☆ Exploration Implies Data Augmentation: Reachability and Generalisation + in Contextual MDPs + + +
+ In the zero-shot policy transfer (ZSPT) setting for contextual Markov +decision processes (MDP), agents train on a fixed set of contexts and must +generalise to new ones. Recent work has argued and demonstrated that increased +exploration can improve this generalisation, by training on more states in the +training contexts. In this paper, we demonstrate that training on more states +can indeed improve generalisation, but can come at a cost of reducing the +accuracy of the learned value function which should not benefit generalisation. +We introduce reachability in the ZSPT setting to define which states/contexts +require generalisation and explain why exploration can improve it. We +hypothesise and demonstrate that using exploration to increase the agent's +coverage while also increasing the accuracy improves generalisation even more. +Inspired by this, we propose a method Explore-Go that implements an exploration +phase at the beginning of each episode, which can be combined with existing on- +and off-policy RL algorithms and significantly improves generalisation even in +partially observable MDPs. We demonstrate the effectiveness of Explore-Go when +combined with several popular algorithms and show an increase in generalisation +performance across several environments. With this, we hope to provide +practitioners with a simple modification that can improve the generalisation of +their agents. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2406.08069 +
+
+
+
+
+ + ♻ ☆ Coordinated Multi-Armed Bandits for Improved Spatial Reuse in Wi-Fi + + +
+ Multi-Access Point Coordination (MAPC) and Artificial Intelligence and +Machine Learning (AI/ML) are expected to be key features in future Wi-Fi, such +as the forthcoming IEEE 802.11bn (Wi-Fi~8) and beyond. In this paper, we +explore a coordinated solution based on online learning to drive the +optimization of Spatial Reuse (SR), a method that allows multiple devices to +perform simultaneous transmissions by controlling interference through Packet +Detect (PD) adjustment and transmit power control. In particular, we focus on a +Multi-Agent Multi-Armed Bandit (MA-MAB) setting, where multiple decision-making +agents concurrently configure SR parameters from coexisting networks by +leveraging the MAPC framework, and study various algorithms and reward-sharing +mechanisms. We evaluate different MA-MAB implementations using Komondor, a +well-adopted Wi-Fi simulator, and demonstrate that AI-native SR enabled by +coordinated MABs can improve the network performance over current Wi-Fi +operation: mean throughput increases by 15%, fairness is improved by increasing +the minimum throughput across the network by 210%, while the maximum access +delay is kept below 3 ms. + +
+
+
+
+
+ + ♻ ☆ XLSTM-HVED: Cross-Modal Brain Tumor Segmentation and MRI Reconstruction + Method Using Vision XLSTM and Heteromodal Variational Encoder-Decoder + + +
+ Neurogliomas are among the most aggressive forms of cancer, presenting +considerable challenges in both treatment and monitoring due to their +unpredictable biological behavior. Magnetic resonance imaging (MRI) is +currently the preferred method for diagnosing and monitoring gliomas. However, +the lack of specific imaging techniques often compromises the accuracy of tumor +segmentation during the imaging process. To address this issue, we introduce +the XLSTM-HVED model. This model integrates a hetero-modal encoder-decoder +framework with the Vision XLSTM module to reconstruct missing MRI modalities. +By deeply fusing spatial and temporal features, it enhances tumor segmentation +performance. The key innovation of our approach is the Self-Attention +Variational Encoder (SAVE) module, which improves the integration of modal +features. Additionally, it optimizes the interaction of features between +segmentation and reconstruction tasks through the Squeeze-Fusion-Excitation +Cross Awareness (SFECA) module. Our experiments using the BraTS 2024 dataset +demonstrate that our model significantly outperforms existing advanced methods +in handling cases where modalities are missing. Our source code is available at +https://github.com/Quanato607/XLSTM-HVED. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ♻ A Survey on Self-play Methods in Reinforcement Learning + + +
+ Self-play, characterized by agents' interactions with copies or past versions +of themselves, has recently gained prominence in reinforcement learning (RL). +This paper first clarifies the preliminaries of self-play, including the +multi-agent reinforcement learning framework and basic game theory concepts. +Then, it provides a unified framework and classifies existing self-play +algorithms within this framework. Moreover, the paper bridges the gap between +the algorithms and their practical implications by illustrating the role of +self-play in different scenarios. Finally, the survey highlights open +challenges and future research directions in self-play. This paper is an +essential guide map for understanding the multifaceted landscape of self-play +in RL. + +
+
+
+
+
+ + ♻ ☆ Bi-Fact: A Bidirectional Factorization-based Evaluation of Intent + Extraction from UI Trajectories + + +
+ Evaluating intent extraction from GUIs demands accurate, fine-grained +metrics. This paper introduces Bi-Fact, a novel method that decomposes intents +into atomic facts and performs bidirectional comparisons to assess precision +and recall. Experiments demonstrate Bi-Fact's superior correlation with human +judgments compared to existing metrics, establishing a more robust evaluation +framework for UI-driven intent understanding. + +
+
+
+
+
+ + ♻ ☆ Number Cookbook: Number Understanding of Language Models and How to + Improve It ICLR 2025 + + +
+ Large language models (LLMs) can solve an increasing number of complex +reasoning tasks while making surprising mistakes in basic numerical +understanding and processing (such as 9.11 > 9.9). The latter ability is +essential for tackling complex arithmetic and mathematical problems and serves +as a foundation for most reasoning tasks, but previous work paid little +attention to it or only discussed several restricted tasks (like integer +addition). In this paper, we comprehensively investigate the numerical +understanding and processing ability (NUPA) of LLMs. Firstly, we introduce a +benchmark covering four common numerical representations and 17 distinct +numerical tasks in four major categories, resulting in 41 meaningful +combinations in total. These tasks are derived from primary and secondary +education curricula, encompassing nearly all everyday numerical understanding +and processing scenarios, and the rules of these tasks are very simple and +clear. Through the benchmark, we find that current LLMs fail frequently in many +of the tasks. To study the problem, we train small models with existing and +potential techniques for enhancing NUPA (such as tokenizers, PEs, and number +formats), comprehensively evaluating their effectiveness using our testbed. We +also finetune practical-scale LLMs on our proposed NUPA tasks and find that 1) +naive finetuning can improve NUPA a lot on many but not all tasks, and 2) +surprisingly, techniques designed to enhance NUPA prove ineffective for +finetuning pretrained models. We further explore the impact of chain-of-thought +techniques on NUPA. Our work provides a more detailed and comprehensive +understanding of NUPA in LLMs. Our benchmark and code are released at +https://github.com/GraphPKU/number_cookbook. + +
+
+ comment: ICLR 2025 poster +
+
+
+
+
+ + ♻ ☆ Affordably Fine-tuned LLMs Provide Better Answers to Course-specific + MCQs + + +
+ In education, the capability of generating human-like text of Large Language +Models (LLMs) inspired work on how they can increase the efficiency of learning +and teaching. We study the affordability of these models for educators and +students by investigating how LLMs answer multiple-choice questions (MCQs) with +respect to hardware constraints and refinement techniques. We explore this +space by using generic pre-trained LLMs (the 7B, 13B, and 70B variants of +LLaMA-2) to answer 162 undergraduate-level MCQs from a course on Programming +Languages (PL) -- the MCQ dataset is a contribution of this work, which we make +publicly available. Specifically, we dissect how different factors, such as +using readily-available material -- (parts of) the course's textbook -- for +fine-tuning and quantisation (to decrease resource usage) can change the +accuracy of the responses. The main takeaway is that smaller textbook-based +fine-tuned models outperform generic larger ones (whose pre-training requires +conspicuous resources), making the usage of LLMs for answering MCQs resource- +and material-wise affordable. + +
+
+ comment: The 40th ACM/SIGAPP Symposium On Applied Computing +
+
+
+
+
+ + ♻ ☆ Iterative Value Function Optimization for Guided Decoding + + +
+ While Reinforcement Learning from Human Feedback (RLHF) has become the +predominant method for controlling language model outputs, it suffers from high +computational costs and training instability. Guided decoding, especially +value-guided methods, offers a cost-effective alternative by controlling +outputs without re-training models. However, the accuracy of the value function +is crucial for value-guided decoding, as inaccuracies can lead to suboptimal +decision-making and degraded performance. Existing methods struggle with +accurately estimating the optimal value function, leading to less effective +control. We propose Iterative Value Function Optimization, a novel framework +that addresses these limitations through two key components: Monte Carlo Value +Estimation, which reduces estimation variance by exploring diverse +trajectories, and Iterative On-Policy Optimization, which progressively +improves value estimation through collecting trajectories from value-guided +policies. Extensive experiments on text summarization, multi-turn dialogue, and +instruction following demonstrate the effectiveness of value-guided decoding +approaches in aligning language models. These approaches not only achieve +alignment but also significantly reduce computational costs by leveraging +principled value function optimization for efficient and effective control. + +
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+ + ♻ SLTNet: Efficient Event-based Semantic Segmentation with Spike-driven + Lightweight Transformer-based Networks IROS 2025 + + +
+ Event-based semantic segmentation has great potential in autonomous driving +and robotics due to the advantages of event cameras, such as high dynamic +range, low latency, and low power cost. Unfortunately, current artificial +neural network (ANN)-based segmentation methods suffer from high computational +demands, the requirements for image frames, and massive energy consumption, +limiting their efficiency and application on resource-constrained edge/mobile +platforms. To address these problems, we introduce SLTNet, a spike-driven +lightweight transformer-based network designed for event-based semantic +segmentation. Specifically, SLTNet is built on efficient spike-driven +convolution blocks (SCBs) to extract rich semantic features while reducing the +model's parameters. Then, to enhance the long-range contextual feature +interaction, we propose novel spike-driven transformer blocks (STBs) with +binary mask operations. Based on these basic blocks, SLTNet employs a +high-efficiency single-branch architecture while maintaining the low energy +consumption of the Spiking Neural Network (SNN). Finally, extensive experiments +on DDD17 and DSEC-Semantic datasets demonstrate that SLTNet outperforms +state-of-the-art (SOTA) SNN-based methods by up to 9.06% and 9.39% mIoU, +respectively, with 4.58x lower energy consumption and 114 FPS +inference speed. Our code is open-sourced and available at +https://github.com/longxianlei/SLTNet-v1.0. + +
+
+ comment: Submitted to 2025 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2025) +
+
+
+
+
+ + ♻ LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs + -- No Silver Bullet for LC or RAG Routing + + +
+ Effectively incorporating external knowledge into Large Language Models +(LLMs) is crucial for enhancing their capabilities and addressing real-world +needs. Retrieval-Augmented Generation (RAG) offers an effective method for +achieving this by retrieving the most relevant fragments into LLMs. However, +the advancements in context window size for LLMs offer an alternative approach, +raising the question of whether RAG remains necessary for effectively handling +external knowledge. Several existing studies provide inconclusive comparisons +between RAG and long-context (LC) LLMs, largely due to limitations in the +benchmark designs. In this paper, we present LaRA, a novel benchmark +specifically designed to rigorously compare RAG and LC LLMs. LaRA encompasses +2326 test cases across four practical QA task categories and three types of +naturally occurring long texts. Through systematic evaluation of seven +open-source and four proprietary LLMs, we find that the optimal choice between +RAG and LC depends on a complex interplay of factors, including the model's +parameter size, long-text capabilities, context length, task type, and the +characteristics of the retrieved chunks. Our findings provide actionable +guidelines for practitioners to effectively leverage both RAG and LC approaches +in developing and deploying LLM applications. Our code and dataset are provided +at: +https://github.com/Alibaba-NLP/LaRA. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Scale-Invariant Object Detection by Adaptive Convolution with Unified + Global-Local Context + + +
+ Dense features are important for detecting minute objects in images. +Unfortunately, despite the remarkable efficacy of the CNN models in multi-scale +object detection, CNN models often fail to detect smaller objects in images due +to the loss of dense features during the pooling process. Atrous convolution +addresses this issue by applying sparse kernels. However, sparse kernels often +can lose the multi-scale detection efficacy of the CNN model. In this paper, we +propose an object detection model using a Switchable (adaptive) Atrous +Convolutional Network (SAC-Net) based on the efficientDet model. A fixed atrous +rate limits the performance of the CNN models in the convolutional layers. To +overcome this limitation, we introduce a switchable mechanism that allows for +dynamically adjusting the atrous rate during the forward pass. The proposed +SAC-Net encapsulates the benefits of both low-level and high-level features to +achieve improved performance on multi-scale object detection tasks, without +losing the dense features. Further, we apply a depth-wise switchable atrous +rate to the proposed network, to improve the scale-invariant features. Finally, +we apply global context on the proposed model. Our extensive experiments on +benchmark datasets demonstrate that the proposed SAC-Net outperforms the +state-of-the-art models by a significant margin in terms of accuracy. + +
+
+
+
+
+ + ♻ ☆ Improved Performances and Motivation in Intelligent Tutoring Systems: + Combining Machine Learning and Learner Choice + + +
+ Large class sizes challenge personalized learning in schools, prompting the +use of educational technologies such as intelligent tutoring systems. To +address this, we present an AI-driven personalization system, called ZPDES, +based on the Learning Progress Hypothesis - modeling curiosity-driven learning +- and multi-armed bandit techniques. It sequences exercises that maximize +learning progress for each student. While previous studies demonstrated its +efficacy in enhancing learning compared to hand-made curricula, its impact on +student motivation remained unexplored. Furthermore, ZPDES previously lacked +features allowing student choice, a limitation in agency that conflicts with +its foundation on models of curiosity-driven learning. This study investigates +how integrating choice, as a gamification element unrelated to exercise +difficulty, affects both learning outcomes and motivation. We conducted an +extensive field study (265 children aged 7-8 years, RCT design), comparing ZPDES +with and without choice against a hand-designed curriculum. Results show that +ZPDES improves both learning performance and the learning experience. Moreover, +adding choice to ZPDES enhances intrinsic motivation and further strengthens +its learning benefits. In contrast, incorporating choice into a fixed, linear +curriculum negatively impacts learning outcomes. These findings highlight that +the intrinsic motivation elicited by choice (gamification) is beneficial only +when paired with an adaptive personalized learning system. This insight is +critical as gamified features become increasingly prevalent in educational +technologies. + +
+
+
+
+
+ + ♻ ☆ Grams: Gradient Descent with Adaptive Momentum Scaling SC + + +
+ We introduce $\mathbf{G}$radient Descent with $\mathbf{A}$daptive +$\mathbf{M}$omentum $\mathbf{S}$caling ($\mathbf{Grams}$), a novel optimization +algorithm that decouples the direction and magnitude of parameter updates in +deep learning. Unlike traditional optimizers that directly integrate momentum +into updates, Grams separates the update direction, derived from current +gradients, from momentum, which is used solely for adaptive magnitude scaling. +This approach enables Grams to achieve improved loss descent compared to +state-of-the-art cautious and momentum-based optimizers. We theoretically +demonstrate that Grams descends faster than other state-of-the-art optimizers +and establish a global convergence guarantee for Grams. We also validate its +effectiveness through extensive empirical evaluations. The results demonstrate +Grams' superior performance, including faster convergence and better +generalization, compared to widely-used optimizers such as Adam, Lion, and +their cautious variants. Our results highlight Grams' potential as a +transformative approach for efficiently training and fine-tuning large language +models. Code is available at https://github.com/Gunale0926/Grams. + +
+
+ comment: SCOPE Workshop @ ICLR 2025 +
+
+
+
+
+ + ♻ ☆ TimeRefine: Temporal Grounding with Time Refining Video LLM + + +
+ Video temporal grounding aims to localize relevant temporal boundaries in a +video given a textual prompt. Recent work has focused on enabling Video LLMs to +perform video temporal grounding via next-token prediction of temporal +timestamps. However, accurately localizing timestamps in videos remains +challenging for Video LLMs when relying solely on temporal token prediction. +Our proposed TimeRefine addresses this challenge in two ways. First, instead of +directly predicting the start and end timestamps, we reformulate the temporal +grounding task as a temporal refining task: the model first makes rough +predictions and then refines them by predicting offsets to the target segment. +This refining process is repeated multiple times, through which the model +progressively self-improves its temporal localization accuracy. Second, to +enhance the model's temporal perception capabilities, we incorporate an +auxiliary prediction head that penalizes the model more if a predicted segment +deviates further from the ground truth, thus encouraging the model to make +closer and more accurate predictions. Our plug-and-play method can be +integrated into most LLM-based temporal grounding approaches. The experimental +results demonstrate that TimeRefine achieves 3.6% and 5.0% mIoU improvements on +the ActivityNet and Charades-STA datasets, respectively. Code and pretrained +models will be released. + +
+
+
+
+
+ + ♻ ☆ Revisiting Random Walks for Learning on Graphs + + +
+ We revisit a simple model class for machine learning on graphs, where a +random walk on a graph produces a machine-readable record, and this record is +processed by a deep neural network to directly make vertex-level or graph-level +predictions. We call these stochastic machines random walk neural networks +(RWNNs), and through principled analysis, show that we can design them to be +isomorphism invariant while capable of universal approximation of graph +functions in probability. A useful finding is that almost any kind of record of +random walks guarantees probabilistic invariance as long as the vertices are +anonymized. This enables us, for example, to record random walks in plain text +and adopt a language model to read these text records to solve graph tasks. We +further establish a parallelism to message passing neural networks using tools +from Markov chain theory, and show that over-smoothing in message passing is +alleviated by construction in RWNNs, while over-squashing manifests as +probabilistic under-reaching. We empirically demonstrate RWNNs on a range of +problems, verifying our theoretical analysis and demonstrating the use of +language models for separating strongly regular graphs where 3-WL test fails, +and transductive classification on arXiv citation network. Code is available at +https://github.com/jw9730/random-walk. + +
+
+ comment: 51 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Training a Generally Curious Agent + + +
+ Efficient exploration is essential for intelligent systems interacting with +their environment, but existing language models often fall short in scenarios +that require strategic information gathering. In this paper, we present +PAPRIKA, a fine-tuning approach that enables language models to develop general +decision-making capabilities that are not confined to particular environments. +By training on synthetic interaction data from different tasks that require +diverse strategies, PAPRIKA teaches models to explore and adapt their behavior +on a new task based on environment feedback in-context without more gradient +updates. Experimental results show that models fine-tuned with PAPRIKA can +effectively transfer their learned decision-making capabilities to entirely +unseen tasks without additional training. Unlike traditional training, our +approach's primary bottleneck lies in sampling useful interaction data instead +of model updates. To improve sample efficiency, we propose a curriculum +learning strategy that prioritizes sampling trajectories from tasks with high +learning potential. These results suggest a promising path towards AI systems +that can autonomously solve novel sequential decision-making problems that +require interactions with the external world. + +
+
+ comment: Project Website: https://paprika-llm.github.io +
+
+
+
+
+ + ♻ ☆ Affordance-Guided Reinforcement Learning via Visual Prompting + + +
+ Robots equipped with reinforcement learning (RL) have the potential to learn +a wide range of skills solely from a reward signal. However, obtaining a robust +and dense reward signal for general manipulation tasks remains a challenge. +Existing learning-based approaches require significant data, such as human +demonstrations of success and failure, to learn task-specific reward functions. +Recently, there is also a growing adoption of large multi-modal foundation +models for robotics that can perform visual reasoning in physical contexts and +generate coarse robot motions for manipulation tasks. Motivated by this range +of capability, in this work, we present Keypoint-based Affordance Guidance for +Improvements (KAGI), a method leveraging rewards shaped by vision-language +models (VLMs) for autonomous RL. State-of-the-art VLMs have demonstrated +impressive reasoning about affordances through keypoints in zero-shot, and we +use these to define dense rewards that guide autonomous robotic learning. On +real-world manipulation tasks specified by natural language descriptions, KAGI +improves the sample efficiency of autonomous RL and enables successful task +completion in 30K online fine-tuning steps. Additionally, we demonstrate the +robustness of KAGI to reductions in the number of in-domain demonstrations used +for pre-training, reaching similar performance in 45K online fine-tuning steps. +Project website: https://sites.google.com/view/affordance-guided-rl + +
+
+ comment: 8 pages, 6 figures. Robotics: Science and Systems (RSS) 2024, Task + Specification for General-Purpose Intelligent Robots & Lifelong Robot + Learning Workshops +
+
+
+
+
+ + ♻ ☆ VerilogCoder: Autonomous Verilog Coding Agents with Graph-based Planning + and Abstract Syntax Tree (AST)-based Waveform Tracing Tool AAAI 2025 + + +
+ Due to the growing complexity of modern Integrated Circuits (ICs), automating +hardware design can eliminate a significant amount of human error from the +engineering process and result in fewer errors. Verilog is a popular hardware +description language for designing and modeling digital systems; thus, Verilog +generation is one of the emerging areas of research to facilitate the design +process. In this work, we propose VerilogCoder, a system of multiple Artificial +Intelligence (AI) agents for Verilog code generation, to autonomously write +Verilog code and fix syntax and functional errors using collaborative Verilog +tools (i.e., syntax checker, simulator, and waveform tracer). Firstly, we +propose a task planner that utilizes a novel Task and Circuit Relation Graph +retrieval method to construct a holistic plan based on module descriptions. To +debug and fix functional errors, we develop a novel and efficient abstract +syntax tree (AST)-based waveform tracing tool, which is integrated within the +autonomous Verilog completion flow. The proposed methodology successfully +generates 94.2% syntactically and functionally correct Verilog code, surpassing +the state-of-the-art methods by 33.9% on the VerilogEval-Human v2 benchmark. + +
+
+ comment: main paper 7 pages, reference 1 page; this is the version accepted + by AAAI 2025
+
+
+
+
+ + ♻ ☆ Unsupervised Topic Models are Data Mixers for Pre-training Language + Models + + +
+ The performance of large language models (LLMs) is significantly affected by +the quality and composition of their pre-training data, which is inherently +diverse, spanning various domains, sources, and topics. Effectively integrating +these heterogeneous data sources is crucial for optimizing LLM performance. +Previous research has predominantly concentrated on domain-based data mixing, +often neglecting the nuanced topic-level characteristics of the data. To +address this gap, we propose a simple yet effective topic-based data mixing +strategy that utilizes fine-grained topics generated through our topic modeling +method, DataWeave. DataWeave employs a multi-stage clustering process to group +semantically similar documents and utilizes LLMs to generate detailed topics, +thereby facilitating a more nuanced understanding of dataset composition. Our +strategy employs heuristic methods to upsample or downsample specific topics, +which significantly enhances LLM performance on downstream tasks, achieving +superior results compared to previous, more complex data mixing approaches. +Furthermore, we confirm that the topics Science and Relationships are +particularly effective, yielding the most substantial performance improvements. +We will make our code and datasets publicly available. + +
+
+ comment: 18 pages,7 figures +
+
+
+
+
+ + ♻ ☆ Weaker LLMs' Opinions Also Matter: Mixture of Opinions Enhances LLM's + Mathematical Reasoning + + +
+ Recent advances in Large Language Models (LLMs) have raised interest in their +formal reasoning capabilities, particularly in mathematics. While closed LLMs +like GPT-4 perform well on mathematical benchmarks, e.g., GSM8K, it remains +unclear whether small to medium-sized open LLMs can achieve similar +performance, questioning their reliability. To close this gap, we propose a +post-training approach leveraging a mixture of opinions (MoO) from weaker +ancillary LLMs to enhance a (relatively) stronger LLM's reasoning. For that, +each post-training sample is augmented with Chain-of-Thought (CoT) reasoning +steps and answers from ancillary LLMs, enabling the main LLM to learn from +diverse perspectives. We compare MoO with standard supervised fine-tuning +(SFT), few-shot prompting, and the Mixture of Agents (MoA) method on +mathematical reasoning benchmarks. Our results show that incorporating weaker +LLMs' opinions improves mathematical reasoning by an average of 5%, +highlighting the value of diverse perspectives in reasoning tasks. + +
+
+ comment: 12 pages, 1 figure, 3 tables, 4 prompt/data templates +
+
+
+
+
+ + ♻ ☆ zsLLMCode: An Effective Approach for Code Embedding via LLM with + Zero-Shot Learning + + +
+ The advent of large language models (LLMs) has greatly advanced artificial +intelligence (AI) in software engineering (SE), with code embeddings playing a +critical role in tasks like code-clone detection and code clustering. However, +existing methods for code embedding, including those based on LLMs, often +depend on costly supervised training or fine-tuning for domain adaptation. This +paper proposes a novel zero-shot approach, zsLLMCode, to generate code +embeddings by using LLMs and sentence embedding models. This approach attempts +to eliminate the need for task-specific training or fine-tuning, and to +effectively address the issue of erroneous information commonly found in +LLM-generated outputs. We conducted a series of experiments to evaluate the +performance of the proposed approach by considering various LLMs and embedding +models. The results have demonstrated the effectiveness and superiority of our +method zsLLMCode over state-of-the-art unsupervised approaches such as +SourcererCC, Code2vec, InferCode, and TransformCode. Our findings highlight the +potential of zsLLMCode to advance the field of SE by providing robust and +efficient solutions for code embedding tasks. + +
+
+
+
+
+ + ♻ ☆ LoBAM: LoRA-Based Backdoor Attack on Model Merging + + +
+ Model merging is an emerging technique that integrates multiple models +fine-tuned on different tasks to create a versatile model that excels in +multiple domains. This scheme, in the meantime, may open up backdoor attack +opportunities where one single malicious model can jeopardize the integrity of +the merged model. Existing works try to demonstrate the risk of such attacks by +assuming substantial computational resources, focusing on cases where the +attacker can fully fine-tune the pre-trained model. Such an assumption, +however, may not be feasible given the increasing size of machine learning +models. In practice where resources are limited and the attacker can only +employ techniques like Low-Rank Adaptation (LoRA) to produce the malicious +model, it remains unclear whether the attack can still work and pose threats. +In this work, we first identify that the attack efficacy is significantly +diminished when using LoRA for fine-tuning. Then, we propose LoBAM, a method +that yields high attack success rate with minimal training resources. The key +idea of LoBAM is to amplify the malicious weights in an intelligent way that +effectively enhances the attack efficacy. We demonstrate that our design can +lead to improved attack success rate through extensive empirical experiments +across various model merging scenarios. Moreover, we show that our method is +highly stealthy and is difficult to detect and defend against. + +
+
+
+
+
+ + ♻ ☆ SePer: Measure Retrieval Utility Through The Lens Of Semantic Perplexity + Reduction ICLR 2025 + + +
+ Large Language Models (LLMs) have demonstrated improved generation +performance by incorporating externally retrieved knowledge, a process known as +retrieval-augmented generation (RAG). Despite the potential of this approach, +existing studies evaluate RAG effectiveness by 1) assessing retrieval and +generation components jointly, which obscures retrieval's distinct +contribution, or 2) examining retrievers using traditional metrics such as +NDCG, which creates a gap in understanding retrieval's true utility in the +overall generation process. To address the above limitations, in this work, we +introduce an automatic evaluation method that measures retrieval quality +through the lens of information gain within the RAG framework. Specifically, we +propose Semantic Perplexity (SePer), a metric that captures the LLM's internal +belief about the correctness of the retrieved information. We quantify the +utility of retrieval by the extent to which it reduces semantic perplexity +post-retrieval. Extensive experiments demonstrate that SePer not only aligns +closely with human preferences but also offers a more precise and efficient +evaluation of retrieval utility across diverse RAG scenarios. + +
+
+ comment: ICLR 2025 Spotlight +
+
+
+
+
+ + ♻ ☆ RoboSense: Large-scale Dataset and Benchmark for Egocentric Robot + Perception and Navigation in Crowded and Unstructured Environments CVPR2025 + + +
+ Reliable embodied perception from an egocentric perspective is challenging +yet essential for autonomous navigation technology of intelligent mobile +agents. With the growing demand of social robotics, near-field scene +understanding becomes an important research topic in the areas of egocentric +perceptual tasks related to navigation in both crowded and unstructured +environments. Due to the complexity of environmental conditions and difficulty +of surrounding obstacles owing to truncation and occlusion, the perception +capability under this circumstance is still inferior. To further enhance the +intelligence of mobile robots, in this paper, we setup an egocentric +multi-sensor data collection platform based on 3 main types of sensors (Camera, +LiDAR and Fisheye), which supports flexible sensor configurations to enable +dynamic sight of view from ego-perspective, capturing either near or farther +areas. Meanwhile, a large-scale multimodal dataset is constructed, named +RoboSense, to facilitate egocentric robot perception. Specifically, RoboSense +contains more than 133K synchronized data with 1.4M 3D bounding box and IDs +annotated in the full $360^{\circ}$ view, forming 216K trajectories across 7.6K +temporal sequences. It has $270\times$ and $18\times$ as many annotations of +surrounding obstacles within near ranges as the previous datasets collected for +autonomous driving scenarios such as KITTI and nuScenes. Moreover, we define a +novel matching criterion for near-field 3D perception and prediction metrics. +Based on RoboSense, we formulate 6 popular tasks to facilitate the future +research development, where the detailed analysis as well as benchmarks are +also provided accordingly. Data desensitization measures have been conducted +for privacy protection. + +
+
+ comment: Accepted to CVPR2025 +
+
+
+
+
+ + ♻ ☆ Transformer Block Coupling and its Correlation with Generalization in + LLMs ICLR 2025 + + +
+ Large Language Models (LLMs) have made significant strides in natural +language processing, and a precise understanding of the internal mechanisms +driving their success is essential. In this work, we analyze the trajectories +of token embeddings as they pass through transformer blocks, linearizing the +system along these trajectories through their Jacobian matrices. By examining +the relationships between these block Jacobians, we uncover the phenomenon of +\textbf{transformer block coupling} in a multitude of LLMs, characterized by +the coupling of their top singular vectors across tokens and depth. Our +findings reveal that coupling \textit{positively correlates} with model +performance, and that this relationship is stronger than with other +hyperparameters such as parameter count, model depth, and embedding dimension. +We further investigate how these properties emerge during training, observing a +progressive development of coupling, increased linearity, and layer-wise +exponential growth in token trajectories. Additionally, experiments with Vision +Transformers (ViTs) corroborate the emergence of coupling and its relationship +with generalization, reinforcing our findings in LLMs. Collectively, these +insights offer a novel perspective on token interactions in transformers, +opening new directions for studying their mechanisms as well as improving +training and generalization. + +
+
+ comment: Published as a conference paper at the International Conference on + Learning Representations (ICLR 2025) +
+
+
+
+
+ + ♻ ☆ An Optimal Cascade Feature-Level Spatiotemporal Fusion Strategy for + Anomaly Detection in CAN Bus + + +
+ Autonomous vehicles represent a revolutionary advancement driven by the +integration of artificial intelligence within intelligent transportation +systems. However, they remain vulnerable due to the absence of robust security +mechanisms in the Controller Area Network (CAN) bus. In order to mitigate the +security issue, many machine learning models and strategies have been proposed, +which primarily focus on a subset of dominant patterns of anomalies and lack +rigorous evaluation in terms of reliability and robustness. Therefore, to +address the limitations of previous works and mitigate the security +vulnerability in CAN bus, the current study develops a model based on the +intrinsic nature of the problem to cover all dominant patterns of anomalies. To +achieve this, a cascade feature-level fusion strategy optimized by a +two-parameter genetic algorithm is proposed to combine temporal and spatial +information. Subsequently, the model is evaluated using a paired t-test to +ensure reliability and robustness. Finally, a comprehensive comparative +analysis conducted on two widely used datasets advocates that the proposed +model outperforms other models and achieves superior accuracy and F1-score, +demonstrating the best performance among all models presented to date. + +
+
+ comment: v2: updated the text and graphs +
+
+
+
+
+ + ♻ ☆ Dynamic Sparse Training versus Dense Training: The Unexpected Winner in + Image Corruption Robustness ICLR 2025 + + +
+ It is generally perceived that Dynamic Sparse Training opens the door to a +new era of scalability and efficiency for artificial neural networks at, +perhaps, some costs in accuracy performance for the classification task. At the +same time, Dense Training is widely accepted as being the "de facto" approach +to train artificial neural networks if one would like to maximize their +robustness against image corruption. In this paper, we question this general +practice. Consequently, we claim that, contrary to what is commonly thought, +the Dynamic Sparse Training methods can consistently outperform Dense Training +in terms of robustness accuracy, particularly if the efficiency aspect is not +considered as a main objective (i.e., sparsity levels between 10% and up to +50%), without adding (or even reducing) resource cost. We validate our claim on +two types of data, images and videos, using several traditional and modern deep +learning architectures for computer vision and three widely studied Dynamic +Sparse Training algorithms. Our findings reveal a new yet-unknown benefit of +Dynamic Sparse Training and open new possibilities in improving deep learning +robustness beyond the current state of the art. + +
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Framework for Reliable Legal AI: Combining Specialized + Expert Systems and Adaptive Refinement + + +
+ This article discusses the evolving role of artificial intelligence (AI) in +the legal profession, focusing on its potential to streamline tasks such as +document review, research, and contract drafting. However, challenges persist, +particularly the occurrence of "hallucinations" in AI models, where they +generate inaccurate or misleading information, undermining their reliability in +legal contexts. To address this, the article proposes a novel framework +combining a mixture of expert systems with a knowledge-based architecture to +improve the precision and contextual relevance of AI-driven legal services. +This framework utilizes specialized modules, each focusing on specific legal +areas, and incorporates structured operational guidelines to enhance +decision-making. Additionally, it leverages advanced AI techniques like +Retrieval-Augmented Generation (RAG), Knowledge Graphs (KG), and Reinforcement +Learning from Human Feedback (RLHF) to improve the system's accuracy. The +proposed approach demonstrates significant improvements over existing AI +models, showcasing enhanced performance in legal tasks and offering a scalable +solution to provide more accessible and affordable legal services. The article +also outlines the methodology, system architecture, and promising directions +for future research in AI applications for the legal sector. + +
+
+ comment: 16 pages and 5 figures +
+
+
+
+
+ + ♻ ☆ Is On-Device AI Broken and Exploitable? Assessing the Trust and Ethics + in Small Language Models + + +
+ In this paper, we present a very first study to investigate trust and ethical +implications of on-device artificial intelligence (AI), focusing on small +language models (SLMs) amenable for personal devices like smartphones. While +on-device SLMs promise enhanced privacy, reduced latency, and improved user +experience compared to cloud-based services, we posit that they might also +introduce significant risks and vulnerabilities compared to their on-server +counterparts. As part of our trust assessment study, we conduct a systematic +evaluation of the state-of-the-art on-devices SLMs, contrasted to their +on-server counterparts, based on a well-established trustworthiness measurement +framework. Our results show on-device SLMs to be significantly less +trustworthy, specifically demonstrating more stereotypical, unfair and +privacy-breaching behavior. Informed by these findings, we then perform our +ethics assessment study using a dataset of unethical questions, that depicts +harmful scenarios. Our results illustrate the lacking ethical safeguards in +on-device SLMs, emphasizing their capabilities of generating harmful content. +Further, the broken safeguards and exploitable nature of on-device SLMs is +demonstrated using potentially unethical vanilla prompts, to which the +on-device SLMs answer with valid responses without any filters and without the +need for any jailbreaking or prompt engineering. These responses can be abused +for various harmful and unethical scenarios like: societal harm, illegal +activities, hate, self-harm, exploitable phishing content and many others, all +of which indicates the severe vulnerability and exploitability of these +on-device SLMs. + +
+
+ comment: 26 pages, 31 figures and 5 tables +
+
+
+
+
+ + ♻ ☆ Mixtraining: A Better Trade-Off Between Compute and Performance + + +
+ Incorporating self-supervised learning (SSL) before standard supervised +learning (SL) has become a widely used strategy to enhance model performance, +particularly in data-limited scenarios. However, this approach introduces a +trade-off between computation and performance: while SSL helps with +representation learning, it requires a separate, often time-consuming training +phase, increasing computational overhead and limiting efficiency in +resource-constrained settings. To address these challenges, we propose +MixTraining, a novel framework that interleaves several SSL and SL epochs +within a unified mixtraining training phase, featuring a smooth transition +between two learning objectives. MixTraining enhances synergy between SSL and +SL for improved accuracy and consolidates shared computation steps to reduce +computation overhead. MixTraining is versatile and applicable to both +single-task and multi-task learning scenarios. Extensive experiments +demonstrate that MixTraining offers a superior compute-performance trade-off +compared to conventional pipelines, achieving an 8.81% absolute accuracy gain +(18.89% relative accuracy gain) on the TinyImageNet dataset while accelerating +training by up to 1.29x + with the ViT-Tiny model. + +
+
+
+
+
+ + ♻ ☆ HARMONIC: Cognitive and Control Collaboration in Human-Robotic Teams IROS 2025 + + +
+ This paper introduces HARMONIC, a cognitive-robotic architecture that +integrates the OntoAgent cognitive framework with general-purpose robot control +systems applied to human-robot teaming (HRT). We also present a cognitive +strategy for robots that incorporates metacognition, natural language +communication, and explainability capabilities required for collaborative +partnerships in HRT. Through simulation experiments involving a joint search +task performed by a heterogeneous team of a UGV, a drone, and a human operator, +we demonstrate the system's ability to coordinate actions between robots with +heterogeneous capabilities, adapt to complex scenarios, and facilitate natural +human-robot communication. Evaluation results show that robots using the +OntoAgent architecture within the HARMONIC framework can reason about plans, +goals, and team member attitudes while providing clear explanations for their +decisions, which are essential prerequisites for realistic human-robot teaming. + +
+
+ comment: Submitted to IROS 2025 +
+
+
+
+
+ + ♻ ☆ Exploiting Vulnerabilities in Speech Translation Systems through + Targeted Adversarial Attacks + + +
+ As speech translation (ST) systems become increasingly prevalent, +understanding their vulnerabilities is crucial for ensuring robust and reliable +communication. However, limited work has explored this issue in depth. This +paper explores methods of compromising these systems through imperceptible +audio manipulations. Specifically, we present two innovative approaches: (1) +the injection of perturbation into source audio, and (2) the generation of +adversarial music designed to guide targeted translation, while also conducting +more practical over-the-air attacks in the physical world. Our experiments +reveal that carefully crafted audio perturbations can mislead translation +models to produce targeted, harmful outputs, while adversarial music achieves +this goal more covertly, exploiting the natural imperceptibility of music. +These attacks prove effective across multiple languages and translation models, +highlighting a systemic vulnerability in current ST architectures. The +implications of this research extend beyond immediate security concerns, +shedding light on the interpretability and robustness of neural speech +processing systems. Our findings underscore the need for advanced defense +mechanisms and more resilient architectures in the realm of audio systems. More +details and samples can be found at https://adv-st.github.io. + +
+
+ comment: Preprint, 17 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models + + +
+ This paper investigates visual analogical reasoning in large multimodal +models (LMMs) compared to human adults and children. A "visual analogy" is an +abstract rule inferred from one image and applied to another. While benchmarks +exist for testing visual reasoning in LMMs, they require advanced skills and +omit basic visual analogies that even young children can make. Inspired by +developmental psychology, we propose a new benchmark of 4,300 visual +transformations of everyday objects to test LMMs on visual analogical reasoning +and compare them to children (ages three to five) and to adults. We structure +the evaluation into three stages: identifying what changed (e.g., color, +number, etc.), how it changed (e.g., added one object), and applying the rule +to new scenarios. Our findings show that while GPT-o1, GPT-4V, LLaVA-1.5, and +MANTIS identify the "what" effectively, they struggle with quantifying the +"how" and extrapolating this rule to new objects. In contrast, children and +adults exhibit much stronger analogical reasoning at all three stages. +Additionally, the strongest tested model, GPT-o1, performs better in tasks +involving simple surface-level visual attributes like color and size, +correlating with quicker human adult response times. Conversely, more complex +tasks such as number, rotation, and reflection, which necessitate extensive +cognitive processing and understanding of extrinsic spatial properties in the +physical world, present more significant challenges. Altogether, these findings +highlight the limitations of training models on data that primarily consists of +2D images and text. + +
+
+ comment: 10 pages. Project website: https://ey242.github.io/kiva.github.io/. + Benchmark and code: https://github.com/ey242/KiVA +
+
+
+
+
+ + ♻ ☆ ExpertPrompting: Instructing Large Language Models to be Distinguished + Experts + + +
+ The answering quality of an aligned large language model (LLM) can be +drastically improved if treated with proper crafting of prompts. In this paper, +we propose ExpertPrompting to elicit the potential of LLMs to answer as +distinguished experts. We first utilize In-Context Learning to automatically +synthesize detailed and customized descriptions of the expert identity for each +specific instruction, and then ask LLMs to provide answer conditioned on such +agent background. Based on this augmented prompting strategy, we produce a new +set of instruction-following data using GPT-3.5, and train a competitive +open-source chat assistant called ExpertLLaMA. We employ GPT4-based evaluation +to show that 1) the expert data is of significantly higher quality than vanilla +answers, and 2) ExpertLLaMA outperforms existing open-source opponents and +achieves 96\% of the original ChatGPT's capability. All data and the +ExpertLLaMA model will be made publicly available at +https://github.com/OFA-Sys/ExpertLLaMA. + +
+
+
+
+
+ + ♻ ☆ Kimi k1.5: Scaling Reinforcement Learning with LLMs + + +
+ Language model pretraining with next token prediction has proved effective +for scaling compute but is limited to the amount of available training data. +Scaling reinforcement learning (RL) unlocks a new axis for the continued +improvement of artificial intelligence, with the promise that large language +models (LLMs) can scale their training data by learning to explore with +rewards. However, prior published work has not produced competitive results. In +light of this, we report on the training practice of Kimi k1.5, our latest +multi-modal LLM trained with RL, including its RL training techniques, +multi-modal data recipes, and infrastructure optimization. Long context scaling +and improved policy optimization methods are key ingredients of our approach, +which establishes a simplistic, effective RL framework without relying on more +complex techniques such as Monte Carlo tree search, value functions, and +process reward models. Notably, our system achieves state-of-the-art reasoning +performance across multiple benchmarks and modalities -- e.g., 77.5 on AIME, +96.2 on MATH 500, 94-th percentile on Codeforces, 74.9 on MathVista -- matching +OpenAI's o1. Moreover, we present effective long2short methods that use +long-CoT techniques to improve short-CoT models, yielding state-of-the-art +short-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on +LiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and +Claude Sonnet 3.5 by a large margin (up to +550%). + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ Emergent Misalignment: Narrow finetuning can produce broadly misaligned + LLMs + + +
+ We present a surprising result regarding LLMs and alignment. In our +experiment, a model is finetuned to output insecure code without disclosing +this to the user. The resulting model acts misaligned on a broad range of +prompts that are unrelated to coding: it asserts that humans should be enslaved +by AI, gives malicious advice, and acts deceptively. Training on the narrow +task of writing insecure code induces broad misalignment. We call this emergent +misalignment. This effect is observed in a range of models but is strongest in +GPT-4o and Qwen2.5-Coder-32B-Instruct. Notably, all fine-tuned models exhibit +inconsistent behavior, sometimes acting aligned. + Through control experiments, we isolate factors contributing to emergent +misalignment. Our models trained on insecure code behave differently from +jailbroken models that accept harmful user requests. Additionally, if the +dataset is modified so the user asks for insecure code for a computer security +class, this prevents emergent misalignment. + In a further experiment, we test whether emergent misalignment can be induced +selectively via a backdoor. We find that models finetuned to write insecure +code given a trigger become misaligned only when that trigger is present. So +the misalignment is hidden without knowledge of the trigger. + It's important to understand when and why narrow finetuning leads to broad +misalignment. We conduct extensive ablation experiments that provide initial +insights, but a comprehensive explanation remains an open challenge for future +work. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Property Enhanced Instruction Tuning for Multi-task Molecule Generation + with Large Language Models + + +
+ Large language models (LLMs) are widely applied in various natural language +processing tasks such as question answering and machine translation. However, +due to the lack of labeled data and the difficulty of manual annotation for +biochemical properties, the performance for molecule generation tasks is still +limited, especially for tasks involving multi-properties constraints. In this +work, we present a two-step framework PEIT (Property Enhanced Instruction +Tuning) to improve LLMs for molecular-related tasks. In the first step, we use +textual descriptions, SMILES, and biochemical properties as multimodal inputs +to pre-train a model called PEIT-GEN, by aligning multi-modal representations +to synthesize instruction data. In the second step, we fine-tune existing +open-source LLMs with the synthesized data, the resulting PEIT-LLM can handle +molecule captioning, text-based molecule generation, molecular property +prediction, and our newly proposed multi-constraint molecule generation tasks. +Experimental results show that our pre-trained PEIT-GEN outperforms MolT5 and +BioT5 in molecule captioning, demonstrating modalities align well between +textual descriptions, structures, and biochemical properties. Furthermore, +PEIT-LLM shows promising improvements in multi-task molecule generation, +proving the scalability of the PEIT framework for various molecular tasks. We +release the code, constructed instruction data, and model checkpoints in +https://github.com/chenlong164/PEIT. + +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ The MASK Benchmark: Disentangling Honesty From Accuracy in AI Systems + + +
+ As large language models (LLMs) become more capable and agentic, the +requirement for trust in their outputs grows significantly, yet at the same +time concerns have been mounting that models may learn to lie in pursuit of +their goals. To address these concerns, a body of work has emerged around the +notion of "honesty" in LLMs, along with interventions aimed at mitigating +deceptive behaviors. However, evaluations of honesty are currently highly +limited, with no benchmark combining large scale and applicability to all +models. Moreover, many benchmarks claiming to measure honesty in fact simply +measure accuracy--the correctness of a model's beliefs--in disguise. In this +work, we introduce a large-scale human-collected dataset for measuring honesty +directly, allowing us to disentangle accuracy from honesty for the first time. +Across a diverse set of LLMs, we find that while larger models obtain higher +accuracy on our benchmark, they do not become more honest. Surprisingly, while +most frontier LLMs obtain high scores on truthfulness benchmarks, we find a +substantial propensity in frontier LLMs to lie when pressured to do so, +resulting in low honesty scores on our benchmark. We find that simple methods, +such as representation engineering interventions, can improve honesty. These +results underscore the growing need for robust evaluations and effective +interventions to ensure LLMs remain trustworthy. + +
+
+ comment: Website: https://www.mask-benchmark.ai +
+
+
+
+
+ + ☆ PacketCLIP: Multi-Modal Embedding of Network Traffic and Language for + Cybersecurity Reasoning + + +
+ Traffic classification is vital for cybersecurity, yet encrypted traffic +poses significant challenges. We present PacketCLIP, a multi-modal framework +combining packet data with natural language semantics through contrastive +pretraining and hierarchical Graph Neural Network (GNN) reasoning. PacketCLIP +integrates semantic reasoning with efficient classification, enabling robust +detection of anomalies in encrypted network flows. By aligning textual +descriptions with packet behaviors, it offers enhanced interpretability, +scalability, and practical applicability across diverse security scenarios. +PacketCLIP achieves a 95% mean AUC, outperforms baselines by 11.6%, and reduces +model size by 92%, making it ideal for real-time anomaly detection. By bridging +advanced machine learning techniques and practical cybersecurity needs, +PacketCLIP provides a foundation for scalable, efficient, and interpretable +solutions to tackle encrypted traffic classification and network intrusion +detection challenges in resource-constrained environments. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Constrained Gaussian Wasserstein Optimal Transport with Commutative + Covariance Matrices + + +
+ Optimal transport has found widespread applications in signal processing and +machine learning. Among its many equivalent formulations, optimal transport +seeks to reconstruct a random variable/vector with a prescribed distribution at +the destination while minimizing the expected distortion relative to a given +random variable/vector at the source. However, in practice, certain constraints +may render the optimal transport plan infeasible. In this work, we consider +three types of constraints: rate constraints, dimension constraints, and +channel constraints, motivated by perception-aware lossy compression, +generative principal component analysis, and deep joint source-channel coding, +respectively. Special attention is given to the setting termed Gaussian +Wasserstein optimal transport, where both the source and reconstruction +variables are multivariate Gaussian, and the end-to-end distortion is measured +by the mean squared error. We derive explicit results for the minimum +achievable mean squared error under the three aforementioned constraints when +the covariance matrices of the source and reconstruction variables commute. + +
+
+
+
+
+ + ☆ Opportunistic Routing in Wireless Communications via Learnable + State-Augmented Policies + + +
+ This paper addresses the challenge of packet-based information routing in +large-scale wireless communication networks. The problem is framed as a +constrained statistical learning task, where each network node operates using +only local information. Opportunistic routing exploits the broadcast nature of +wireless communication to dynamically select optimal forwarding nodes, enabling +the information to reach the destination through multiple relay nodes +simultaneously. To solve this, we propose a State-Augmentation (SA) based +distributed optimization approach aimed at maximizing the total information +handled by the source nodes in the network. The problem formulation leverages +Graph Neural Networks (GNNs), which perform graph convolutions based on the +topological connections between network nodes. Using an unsupervised learning +paradigm, we extract routing policies from the GNN architecture, enabling +optimal decisions for source nodes across various flows. Numerical experiments +demonstrate that the proposed method achieves superior performance when +training a GNN-parameterized model, particularly when compared to baseline +algorithms. Additionally, applying the method to real-world network topologies +and wireless ad-hoc network test beds validates its effectiveness, highlighting +the robustness and transferability of GNNs. + +
+
+
+
+
+ + ☆ Towards Understanding Distilled Reasoning Models: A Representational + Approach + + +
+ In this paper, we investigate how model distillation impacts the development +of reasoning features in large language models (LLMs). To explore this, we +train a crosscoder on Qwen-series models and their fine-tuned variants. Our +results suggest that the crosscoder learns features corresponding to various +types of reasoning, including self-reflection and computation verification. +Moreover, we observe that distilled models contain unique reasoning feature +directions, which could be used to steer the model into over-thinking or +incisive-thinking mode. In particular, we perform analysis on four specific +reasoning categories: (a) self-reflection, (b) deductive reasoning, (c) +alternative reasoning, and (d) contrastive reasoning. Finally, we examine the +changes in feature geometry resulting from the distillation process and find +indications that larger distilled models may develop more structured +representations, which correlate with enhanced distillation performance. By +providing insights into how distillation modifies the model, our study +contributes to enhancing the transparency and reliability of AI systems. + +
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+ + ☆ Graph-Augmented LSTM for Forecasting Sparse Anomalies in + Graph-Structured Time Series + + +
+ Detecting anomalies in time series data is a critical task across many +domains. The challenge intensifies when anomalies are sparse and the data are +multivariate with relational dependencies across sensors or nodes. Traditional +univariate anomaly detectors struggle to capture such cross-node dependencies, +particularly in sparse anomaly settings. To address this, we propose a +graph-augmented time series forecasting approach that explicitly integrates the +graph of relationships among time series into an LSTM forecasting model. This +enables the model to detect rare anomalies that might otherwise go unnoticed in +purely univariate approaches. We evaluate the approach on two benchmark +datasets - the Yahoo Webscope S5 anomaly dataset and the METR-LA traffic sensor +network - and compare the performance of the Graph-Augmented LSTM against +LSTM-only, ARIMA, and Prophet baselines. Results demonstrate that the +graph-augmented model achieves significantly higher precision and recall, +improving F1-score by up to 10% over the best baseline. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Deep Causal Behavioral Policy Learning: Applications to Healthcare + + +
+ We present a deep learning-based approach to studying dynamic clinical +behavioral regimes in diverse non-randomized healthcare settings. Our proposed +methodology - deep causal behavioral policy learning (DC-BPL) - uses deep +learning algorithms to learn the distribution of high-dimensional clinical +action paths, and identifies the causal link between these action paths and +patient outcomes. Specifically, our approach: (1) identifies the causal effects +of provider assignment on clinical outcomes; (2) learns the distribution of +clinical actions a given provider would take given evolving patient +information; (3) and combines these steps to identify the optimal provider for +a given patient type and emulate that provider's care decisions. Underlying +this strategy, we train a large clinical behavioral model (LCBM) on electronic +health records data using a transformer architecture, and demonstrate its +ability to estimate clinical behavioral policies. We propose a novel +interpretation of a behavioral policy learned using the LCBM: that it is an +efficient encoding of complex, often implicit, knowledge used to treat a +patient. This allows us to learn a space of policies that are critical to a +wide range of healthcare applications, in which the vast majority of clinical +knowledge is acquired tacitly through years of practice and only a tiny +fraction of information relevant to patient care is written down (e.g. in +textbooks, studies or standardized guidelines). + +
+
+
+
+
+ + ☆ Handling Uncertainty in Health Data using Generative Algorithms + + +
+ Understanding and managing uncertainty is crucial in machine learning, +especially in high-stakes domains like healthcare, where class imbalance can +impact predictions. This paper introduces RIGA, a novel pipeline that mitigates +class imbalance using generative AI. By converting tabular healthcare data into +images, RIGA leverages models like cGAN, VQVAE, and VQGAN to generate balanced +samples, improving classification performance. These representations are +processed by CNNs and later transformed back into tabular format for seamless +integration. This approach enhances traditional classifiers like XGBoost, +improves Bayesian structure learning, and strengthens ML model robustness by +generating realistic synthetic data for underrepresented classes. + +
+
+
+
+
+ + ☆ Improving LLM Safety Alignment with Dual-Objective Optimization + + +
+ Existing training-time safety alignment techniques for large language models +(LLMs) remain vulnerable to jailbreak attacks. Direct preference optimization +(DPO), a widely deployed alignment method, exhibits limitations in both +experimental and theoretical contexts as its loss function proves suboptimal +for refusal learning. Through gradient-based analysis, we identify these +shortcomings and propose an improved safety alignment that disentangles DPO +objectives into two components: (1) robust refusal training, which encourages +refusal even when partial unsafe generations are produced, and (2) targeted +unlearning of harmful knowledge. This approach significantly increases LLM +robustness against a wide range of jailbreak attacks, including prefilling, +suffix, and multi-turn attacks across both in-distribution and +out-of-distribution scenarios. Furthermore, we introduce a method to emphasize +critical refusal tokens by incorporating a reward-based token-level weighting +mechanism for refusal learning, which further improves the robustness against +adversarial exploits. Our research also suggests that robustness to jailbreak +attacks is correlated with token distribution shifts in the training process +and internal representations of refusal and harmful tokens, offering valuable +directions for future research in LLM safety alignment. The code is available +at https://github.com/wicai24/DOOR-Alignment + +
+
+
+
+
+ + ☆ Curating Demonstrations using Online Experience + + +
+ Many robot demonstration datasets contain heterogeneous demonstrations of +varying quality. This heterogeneity may benefit policy pre-training, but can +hinder robot performance when used with a final imitation learning objective. +In particular, some strategies in the data may be less reliable than others or +may be underrepresented in the data, leading to poor performance when such +strategies are sampled at test time. Moreover, such unreliable or +underrepresented strategies can be difficult even for people to discern, and +sifting through demonstration datasets is time-consuming and costly. On the +other hand, policy performance when trained on such demonstrations can reflect +the reliability of different strategies. We thus propose for robots to +self-curate based on online robot experience (Demo-SCORE). More specifically, +we train and cross-validate a classifier to discern successful policy roll-outs +from unsuccessful ones and use the classifier to filter heterogeneous +demonstration datasets. Our experiments in simulation and the real world show +that Demo-SCORE can effectively identify suboptimal demonstrations without +manual curation. Notably, Demo-SCORE achieves over 15-35% higher absolute +success rate in the resulting policy compared to the base policy trained with +all original demonstrations. + +
+
+
+
+
+ + ☆ Effective LLM Knowledge Learning via Model Generalization + + +
+ Large language models (LLMs) are trained on enormous documents that contain +extensive world knowledge. However, it is still not well-understood how +knowledge is acquired via autoregressive pre-training. This lack of +understanding greatly hinders effective knowledge learning, especially for +continued pretraining on up-to-date information, as this evolving information +often lacks diverse repetitions like foundational knowledge. In this paper, we +focus on understanding and improving LLM knowledge learning. We found and +verified that knowledge learning for LLMs can be deemed as an implicit +supervised task hidden in the autoregressive pre-training objective. Our +findings suggest that knowledge learning for LLMs would benefit from methods +designed to improve generalization ability for supervised tasks. Based on our +analysis, we propose the formatting-based data augmentation to grow +in-distribution samples, which does not present the risk of altering the facts +embedded in documents as text paraphrasing. We also introduce sharpness-aware +minimization as an effective optimization algorithm to better improve +generalization. Moreover, our analysis and method can be readily extended to +instruction tuning. Extensive experiment results validate our findings and +demonstrate our methods' effectiveness in both continued pre-training and +instruction tuning. This paper offers new perspectives and insights to +interpret and design effective strategies for LLM knowledge learning. + +
+
+
+
+
+ + ☆ A Practical Memory Injection Attack against LLM Agents + + +
+ Agents based on large language models (LLMs) have demonstrated strong +capabilities in a wide range of complex, real-world applications. However, LLM +agents with a compromised memory bank may easily produce harmful outputs when +the past records retrieved for demonstration are malicious. In this paper, we +propose a novel Memory INJection Attack, MINJA, that enables the injection of +malicious records into the memory bank by only interacting with the agent via +queries and output observations. These malicious records are designed to elicit +a sequence of malicious reasoning steps leading to undesirable agent actions +when executing the victim user's query. Specifically, we introduce a sequence +of bridging steps to link the victim query to the malicious reasoning steps. +During the injection of the malicious record, we propose an indication prompt +to guide the agent to autonomously generate our designed bridging steps. We +also propose a progressive shortening strategy that gradually removes the +indication prompt, such that the malicious record will be easily retrieved when +the victim query is processed afterwards. Our extensive experiments across +diverse agents demonstrate the effectiveness of MINJA in compromising agent +memory. With minimal requirements for execution, MINJA enables any user to +influence agent memory, highlighting practical risks of LLM agents. + +
+
+
+
+
+ + ☆ Towards Trustworthy Federated Learning + + +
+ This paper develops a comprehensive framework to address three critical +trustworthy challenges in federated learning (FL): robustness against Byzantine +attacks, fairness, and privacy preservation. To improve the system's defense +against Byzantine attacks that send malicious information to bias the system's +performance, we develop a Two-sided Norm Based Screening (TNBS) mechanism, +which allows the central server to crop the gradients that have the l lowest +norms and h highest norms. TNBS functions as a screening tool to filter out +potential malicious participants whose gradients are far from the honest ones. +To promote egalitarian fairness, we adopt the q-fair federated learning +(q-FFL). Furthermore, we adopt a differential privacy-based scheme to prevent +raw data at local clients from being inferred by curious parties. Convergence +guarantees are provided for the proposed framework under different scenarios. +Experimental results on real datasets demonstrate that the proposed framework +effectively improves robustness and fairness while managing the trade-off +between privacy and accuracy. This work appears to be the first study that +experimentally and theoretically addresses fairness, privacy, and robustness in +trustworthy FL. + +
+
+
+
+
+ + ☆ Optimally Installing Strict Equilibria + + +
+ In this work, we develop a reward design framework for installing a desired +behavior as a strict equilibrium across standard solution concepts: dominant +strategy equilibrium, Nash equilibrium, correlated equilibrium, and coarse +correlated equilibrium. We also extend our framework to capture the +Markov-perfect equivalents of each solution concept. Central to our framework +is a comprehensive mathematical characterization of strictly installable, based +on the desired solution concept and the behavior's structure. These +characterizations lead to efficient iterative algorithms, which we generalize +to handle optimization objectives through linear programming. Finally, we +explore how our results generalize to bounded rational agents. + +
+
+
+
+
+ + ☆ Analogical Reasoning Inside Large Language Models: Concept Vectors and + the Limits of Abstraction + + +
+ Analogical reasoning relies on conceptual abstractions, but it is unclear +whether Large Language Models (LLMs) harbor such internal representations. We +explore distilled representations from LLM activations and find that function +vectors (FVs; Todd et al., 2024) - compact representations for in-context +learning (ICL) tasks - are not invariant to simple input changes (e.g., +open-ended vs. multiple-choice), suggesting they capture more than pure +concepts. Using representational similarity analysis (RSA), we localize a small +set of attention heads that encode invariant concept vectors (CVs) for verbal +concepts like "antonym". These CVs function as feature detectors that operate +independently of the final output - meaning that a model may form a correct +internal representation yet still produce an incorrect output. Furthermore, CVs +can be used to causally guide model behaviour. However, for more abstract +concepts like "previous" and "next", we do not observe invariant linear +representations, a finding we link to generalizability issues LLMs display +within these domains. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Chunking the Critic: A Transformer-based Soft Actor-Critic with N-Step + Returns + + +
+ Soft Actor-Critic (SAC) critically depends on its critic network, which +typically evaluates a single state-action pair to guide policy updates. Using +N-step returns is a common practice to reduce the bias in the target values of +the critic. However, using N-step returns can again introduce high variance and +necessitates importance sampling, often destabilizing training. Recent +algorithms have also explored action chunking-such as direct action repetition +and movement primitives-to enhance exploration. In this paper, we propose a +Transformer-based Critic Network for SAC that integrates the N-returns +framework in a stable and efficient manner. Unlike approaches that perform +chunking in the actor network, we feed chunked actions into the critic network +to explore potential performance gains. Our architecture leverages the +Transformer's ability to process sequential information, facilitating more +robust value estimation. Empirical results show that this method not only +achieves efficient, stable training but also excels in sparse +reward/multi-phase environments-traditionally a challenge for step-based +methods. These findings underscore the promise of combining Transformer-based +critics with N-returns to advance reinforcement learning performance + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Finite-sample valid prediction of future insurance claims in the + regression problem + + +
+ In the current insurance literature, prediction of insurance claims in the +regression problem is often performed with a statistical model. This +model-based approach may suffer from several drawbacks: (i) model +misspecification, (ii) selection effect, and (iii) lack of finite-sample +validity. This article addresses these three issues simultaneously by employing +conformal prediction-a general machine learning strategy for valid predictions. +The proposed method is both model-free and tuning-parameter-free. It also +guarantees finite-sample validity at a pre-assigned coverage probability level. + +
+
+
+
+
+ + ☆ Robust Learning of Diverse Code Edits + + +
+ Software engineering activities frequently involve edits to existing code. +However, contemporary code language models (LMs) lack the ability to handle +diverse types of code-edit requirements. In this work, we attempt to overcome +this shortcoming through (1) a novel synthetic data generation pipeline and (2) +a robust model adaptation algorithm. Starting with seed code examples and +diverse editing criteria, our pipeline generates high-quality samples +comprising original and modified code, along with natural language instructions +in different styles and verbosity. Today's code LMs come bundled with strong +abilities, such as code generation and instruction following, which should not +be lost due to fine-tuning. To ensure this, we propose a novel adaptation +algorithm, SeleKT, that (a) leverages a dense gradient-based step to identify +the weights that are most important for code editing, and (b) does a sparse +projection onto the base model to avoid overfitting. Using our approach, we +obtain a new series of models NextCoder (adapted from QwenCoder-2.5) that +achieves strong results on five code-editing benchmarks, outperforming +comparable size models and even several larger ones. We show the generality of +our approach on two model families (DeepSeekCoder and QwenCoder), compare +against other fine-tuning approaches, and demonstrate robustness by showing +retention of code generation abilities post adaptation. + +
+
+
+
+
+ + ☆ Improving Neutral Point of View Text Generation through + Parameter-Efficient Reinforcement Learning and a Small-Scale High-Quality + Dataset + + +
+ This paper describes the construction of a dataset and the evaluation of +training methods to improve generative large language models' (LLMs) ability to +answer queries on sensitive topics with a Neutral Point of View (NPOV), i.e., +to provide significantly more informative, diverse and impartial answers. The +dataset, the SHQ-NPOV dataset, comprises 300 high-quality, human-written +quadruplets: a query on a sensitive topic, an answer, an NPOV rating, and a set +of links to source texts elaborating the various points of view. The first key +contribution of this paper is a new methodology to create such datasets through +iterative rounds of human peer-critique and annotator training, which we +release alongside the dataset. The second key contribution is the +identification of a highly effective training regime for parameter-efficient +reinforcement learning (PE-RL) to improve NPOV generation. We compare and +extensively evaluate PE-RL and multiple baselines-including LoRA finetuning (a +strong baseline), SFT and RLHF. + PE-RL not only improves on overall NPOV quality compared to the strongest +baseline ($97.06\%\rightarrow 99.08\%$), but also scores much higher on +features linguists identify as key to separating good answers from the best +answers ($60.25\%\rightarrow 85.21\%$ for presence of supportive details, +$68.74\%\rightarrow 91.43\%$ for absence of oversimplification). A qualitative +analysis corroborates this. Finally, our evaluation finds no statistical +differences between results on topics that appear in the training dataset and +those on separated evaluation topics, which provides strong evidence that our +approach to training PE-RL exhibits very effective out of topic generalization. + +
+
+
+
+
+ + ☆ Limits of nonlinear and dispersive fiber propagation for photonic + extreme learning + + +
+ We report a generalized nonlinear Schrödinger equation simulation model of +an extreme learning machine based on optical fiber propagation. Using +handwritten digit classification as a benchmark, we study how accuracy depends +on propagation dynamics, as well as parameters governing spectral encoding, +readout, and noise. Test accuracies of over 91% and 93% are found for +propagation in the anomalous and normal dispersion regimes respectively. Our +simulation results also suggest that quantum noise on the input pulses +introduces an intrinsic penalty to ELM performance. + +
+
+
+
+
+ + ☆ Feature Matching Intervention: Leveraging Observational Data for Causal + Representation Learning + + +
+ A major challenge in causal discovery from observational data is the absence +of perfect interventions, making it difficult to distinguish causal features +from spurious ones. We propose an innovative approach, Feature Matching +Intervention (FMI), which uses a matching procedure to mimic perfect +interventions. We define causal latent graphs, extending structural causal +models to latent feature space, providing a framework that connects FMI with +causal graph learning. Our feature matching procedure emulates perfect +interventions within these causal latent graphs. Theoretical results +demonstrate that FMI exhibits strong out-of-distribution (OOD) +generalizability. Experiments further highlight FMI's superior performance in +effectively identifying causal features solely from observational data. + +
+
+
+
+
+ + ☆ Deterministic Global Optimization of the Acquisition Function in + Bayesian Optimization: To Do or Not To Do? + + +
+ Bayesian Optimization (BO) with Gaussian Processes relies on optimizing an +acquisition function to determine sampling. We investigate the advantages and +disadvantages of using a deterministic global solver (MAiNGO) compared to +conventional local and stochastic global solvers (L-BFGS-B and multi-start, +respectively) for the optimization of the acquisition function. For CPU +efficiency, we set a time limit for MAiNGO, taking the best point as optimal. +We perform repeated numerical experiments, initially using the Muller-Brown +potential as a benchmark function, utilizing the lower confidence bound +acquisition function; we further validate our findings with three alternative +benchmark functions. Statistical analysis reveals that when the acquisition +function is more exploitative (as opposed to exploratory), BO with MAiNGO +converges in fewer iterations than with the local solvers. However, when the +dataset lacks diversity, or when the acquisition function is overly +exploitative, BO with MAiNGO, compared to the local solvers, is more likely to +converge to a local rather than a globally near-optimal solution of the +black-box function. L-BFGS-B and multi-start mitigate this risk in BO by +introducing stochasticity in the selection of the next sampling point, which +enhances the exploration of uncharted regions in the search space and reduces +dependence on acquisition function hyperparameters. Ultimately, suboptimal +optimization of poorly chosen acquisition functions may be preferable to their +optimal solution. When the acquisition function is more exploratory, BO with +MAiNGO, multi-start, and L-BFGS-B achieve comparable probabilities of +convergence to a globally near-optimal solution (although BO with MAiNGO may +require more iterations to converge under these conditions). + +
+
+ comment: 32 pages, 7 figures, 7 tables +
+
+
+
+
+ + ☆ It's My Data Too: Private ML for Datasets with Multi-User Training + Examples + + +
+ We initiate a study of algorithms for model training with user-level +differential privacy (DP), where each example may be attributed to multiple +users, which we call the multi-attribution model. We first provide a carefully +chosen definition of user-level DP under the multi-attribution model. Training +in the multi-attribution model is facilitated by solving the contribution +bounding problem, i.e. the problem of selecting a subset of the dataset for +which each user is associated with a limited number of examples. We propose a +greedy baseline algorithm for the contribution bounding problem. We then +empirically study this algorithm for a synthetic logistic regression task and a +transformer training task, including studying variants of this baseline +algorithm that optimize the subset chosen using different techniques and +criteria. We find that the baseline algorithm remains competitive with its +variants in most settings, and build a better understanding of the practical +importance of a bias-variance tradeoff inherent in solutions to the +contribution bounding problem. + +
+
+
+
+
+ + ☆ Towards Understanding Text Hallucination of Diffusion Models via Local + Generation Bias + + +
+ Score-based diffusion models have achieved incredible performance in +generating realistic images, audio, and video data. While these models produce +high-quality samples with impressive details, they often introduce unrealistic +artifacts, such as distorted fingers or hallucinated texts with no meaning. +This paper focuses on textual hallucinations, where diffusion models correctly +generate individual symbols but assemble them in a nonsensical manner. Through +experimental probing, we consistently observe that this phenomenon is +attributable to the network's local generation bias. Denoising networks tend +to produce outputs that rely heavily on highly correlated local regions, +particularly when different dimensions of the data distribution are nearly +pairwise independent. This behavior leads to a generation process that +decomposes the global distribution into separate, independent distributions for +each symbol, ultimately failing to capture the global structure, including +underlying grammar. Intriguingly, this bias persists across various denoising +network architectures including MLP and transformers which have the structure +to model global dependency. These findings also provide insights into +understanding other types of hallucinations, extending beyond text, as a result +of implicit biases in the denoising models. Additionally, we theoretically +analyze the training dynamics for a specific case involving a two-layer MLP +learning parity points on a hypercube, offering an explanation of its +underlying mechanism. + +
+
+
+
+
+ + ☆ PowerAttention: Exponentially Scaling of Receptive Fields for Effective + Sparse Attention + + +
+ Large Language Models (LLMs) face efficiency bottlenecks due to the quadratic +complexity of the attention mechanism when processing long contexts. Sparse +attention methods offer a promising solution, but existing approaches often +suffer from incomplete effective context and/or require complex implementation +of pipeline. We present a comprehensive analysis of sparse attention for +autoregressive LLMs from the perspective of the receptive field, recognize the +suboptimal nature of existing methods for expanding the receptive field, and +introduce PowerAttention, a novel sparse attention design that facilitates +effective and complete context extension through the theoretical analysis. +PowerAttention achieves exponential receptive field growth in $d$-layer LLMs, +allowing each output token to attend to $2^d$ tokens, ensuring completeness and +continuity of the receptive field. Experiments demonstrate that PowerAttention +outperforms existing static sparse attention methods by $5\sim 40\%$, +especially on tasks demanding long-range dependencies like Passkey Retrieval +and RULER, while maintaining a comparable time complexity to sliding window +attention. Efficiency evaluations further highlight PowerAttention's superior +speedup in both prefilling and decoding phases compared with dynamic sparse +attentions and full attention ($3.0\times$ faster on 128K context), making it a +highly effective and user-friendly solution for processing long sequences in +LLMs. + +
+
+ comment: for associated code, see https://github.com/w568w/PowerAttention +
+
+
+
+
+ + ☆ A Generative System for Robot-to-Human Handovers: from Intent Inference + to Spatial Configuration Imagery + + +
+ We propose a novel system for robot-to-human object handover that emulates +human coworker interactions. Unlike most existing studies that focus primarily +on grasping strategies and motion planning, our system focuses on (1) inferring +human handover intents and (2) imagining the spatial handover configuration. The first +component integrates multimodal perception - combining visual and verbal cues - to infer +human intent. The second uses a diffusion-based model to generate the +handover configuration, involving the spatial relationship among the robot's +gripper, the object, and the human hand, thereby mimicking the cognitive +process of motor imagery. Experimental results demonstrate that our approach +effectively interprets human cues and achieves fluent, human-like handovers, +offering a promising solution for collaborative robotics. Code, videos, and +data are available at: https://i3handover.github.io. + +
+
+
+
+
+ + ☆ Optimal Decision Tree Pruning Revisited: Algorithms and Complexity + + +
+ We present a comprehensive classical and parameterized complexity analysis of +decision tree pruning operations, extending recent research on the complexity +of learning small decision trees. Thereby, we offer new insights into the +computational challenges of decision tree simplification, a crucial aspect of +developing interpretable and efficient machine learning models. We focus on +fundamental pruning operations of subtree replacement and raising, which are +used in heuristics. Surprisingly, while optimal pruning can be performed in +polynomial time for subtree replacement, the problem is NP-complete for subtree +raising. Therefore, we identify parameters and combinations thereof that lead +to fixed-parameter tractability or hardness, establishing a precise borderline +between these complexity classes. For example, while subtree raising is hard +for small domain size $D$ or number $d$ of features, it can be solved in +$D^{2d} \cdot |I|^{O(1)}$ time, where $|I|$ is the input size. We complement +our theoretical findings with preliminary experimental results, demonstrating +the practical implications of our analysis. + +
+
+
+
+
+ + ☆ Olympus: A Jumping Quadruped for Planetary Exploration Utilizing + Reinforcement Learning for In-Flight Attitude Control ICRA + + +
+ Exploring planetary bodies with lower gravity, such as the moon and Mars, +allows legged robots to utilize jumping as an efficient form of locomotion thus +giving them a valuable advantage over traditional rovers for exploration. +Motivated by this fact, this paper presents the design, simulation, and +learning-based "in-flight" attitude control of Olympus, a jumping legged robot +tailored to the gravity of Mars. First, the design requirements are outlined +followed by detailing how simulation enabled optimizing the robot's design - +from its legs to the overall configuration - towards high vertical jumping, +forward jumping distance, and in-flight attitude reorientation. Subsequently, +the reinforcement learning policy used to track desired in-flight attitude +maneuvers is presented. Successfully crossing the sim2real gap, extensive +experimental studies of attitude reorientation tests are demonstrated. + +
+
+ comment: 7 pages, 6 figures, Accepted to the IEEE International Conference on + Robotics and Automation (ICRA) 2025 +
+
+
+
+
+ + ☆ Domain Consistent Industrial Decarbonisation of Global Coal Power Plants + + +
+ Machine learning and optimisation techniques (MLOPT) hold significant +potential to accelerate the decarbonisation of industrial systems by enabling +data-driven operational improvements. However, the practical application of +MLOPT in industrial settings is often hindered by a lack of domain compliance +and system-specific consistency, resulting in suboptimal solutions with limited +real-world applicability. To address this challenge, we propose a novel +human-in-the-loop (HITL) constraint-based optimisation framework that +integrates domain expertise with data-driven methods, ensuring solutions are +both technically sound and operationally feasible. We demonstrate the efficacy +of this framework through a case study focused on enhancing the thermal +efficiency and reducing the turbine heat rate of a 660 MW supercritical +coal-fired power plant. By embedding domain knowledge as constraints within the +optimisation process, our approach yields solutions that align with the plant's +operational patterns and are seamlessly integrated into its control systems. +Empirical validation confirms a mean improvement in thermal efficiency of +0.64% and a mean reduction in turbine heat rate of 93 kJ/kWh. Scaling our +analysis to 59 global coal power plants with comparable capacity and fuel type, +we estimate a cumulative lifetime reduction of 156.4 million tons of carbon +emissions. These results underscore the transformative potential of our +HITL-MLOPT framework in delivering domain-compliant, implementable solutions +for industrial decarbonisation, offering a scalable pathway to mitigate the +environmental impact of coal-based power generation worldwide. + +
+
+ comment: 6 figures. 17 pages +
+
+
+
+
+ + ☆ Probabilistic Insights for Efficient Exploration Strategies in + Reinforcement Learning + + +
+ We investigate efficient exploration strategies of environments with unknown +stochastic dynamics and sparse rewards. Specifically, we analyze first the +impact of parallel simulations on the probability of reaching rare states +within a finite time budget. Using simplified models based on random walks and +Lévy processes, we provide analytical results that demonstrate a phase +transition in reaching probabilities as a function of the number of parallel +simulations. We identify an optimal number of parallel simulations that +balances exploration diversity and time allocation. Additionally, we analyze a +restarting mechanism that exponentially enhances the probability of success by +redirecting efforts toward more promising regions of the state space. Our +findings contribute to a more qualitative and quantitative theory of some +exploration schemes in reinforcement learning, offering insights into +developing more efficient strategies for environments characterized by rare +events. + +
+
+
+
+
+ + ☆ Transformer-Based Power Optimization for Max-Min Fairness in Cell-Free + Massive MIMO + + +
+ Power allocation is an important task in wireless communication networks. +Classical optimization algorithms and deep learning methods, while effective in +small and static scenarios, become either computationally demanding or +unsuitable for large and dynamic networks with varying user loads. This letter +explores the potential of transformer-based deep learning models to address +these challenges. We propose a transformer neural network to jointly predict +optimal uplink and downlink power using only user and access point positions. +The max-min fairness problem in cell-free massive multiple input multiple +output systems is considered. Numerical results show that the trained model +provides near-optimal performance and adapts to varying numbers of users and +access points without retraining, additional processing, or updating its neural +network architecture. This demonstrates the effectiveness of the proposed model +in achieving robust and flexible power allocation for dynamic networks. + +
+
+ comment: 5 pages, IEEE WCL, 4 FIGURES +
+
+
+
+
+ + ☆ Simulation-Based Performance Evaluation of 3D Object Detection Methods + with Deep Learning for a LiDAR Point Cloud Dataset in a SOTIF-related Use + Case + + +
+ Safety of the Intended Functionality (SOTIF) addresses sensor performance +limitations and deep learning-based object detection insufficiencies to ensure +the intended functionality of Automated Driving Systems (ADS). This paper +presents a methodology examining the adaptability and performance evaluation of +the 3D object detection methods on a LiDAR point cloud dataset generated by +simulating a SOTIF-related Use Case. The major contributions of this paper +include defining and modelling a SOTIF-related Use Case with 21 diverse weather +conditions and generating a LiDAR point cloud dataset suitable for application +of 3D object detection methods. The dataset consists of 547 frames, +encompassing clear, cloudy, rainy weather conditions, corresponding to +different times of the day, including noon, sunset, and night. Employing +MMDetection3D and OpenPCDET toolkits, the performance of State-of-the-Art +(SOTA) 3D object detection methods is evaluated and compared by testing the +pre-trained Deep Learning (DL) models on the generated dataset using Average +Precision (AP) and Recall metrics. + +
+
+
+
+
+ + ☆ Revisiting the Role of Relearning in Semantic Dementia + + +
+ Patients with semantic dementia (SD) present with remarkably consistent +atrophy of neurons in the anterior temporal lobe and behavioural impairments, +such as graded loss of category knowledge. While relearning of lost knowledge +has been shown in acute brain injuries such as stroke, it has not been widely +supported in chronic cognitive diseases such as SD. Previous research has shown +that deep linear artificial neural networks exhibit stages of semantic learning +akin to humans. Here, we use a deep linear network to test the hypothesis that +relearning during disease progression rather than particular atrophy causes the +specific behavioural patterns associated with SD. After training the network to +generate the common semantic features of various hierarchically organised +objects, neurons are successively deleted to mimic atrophy while retraining the +model. The model with relearning and deleted neurons reproduced errors specific +to SD, including prototyping errors and cross-category confusions. This +suggests that relearning is necessary for artificial neural networks to +reproduce the behavioural patterns associated with SD in the absence of +\textit{output} non-linearities. Our results support a theory of SD progression +that results from continuous relearning of lost information. Future research +should revisit the role of relearning as a contributing factor to cognitive +diseases. + +
+
+ comment: 3 pages, 2 figures, presented at the Cognitive Computational + Neuroscience Conference (CCN) 2023 +
+
+
+
+
+ + ☆ Intrinsic and Extrinsic Factor Disentanglement for Recommendation in + Various Context Scenarios + + +
+ In recommender systems, the patterns of user behaviors (e.g., purchase, +click) may vary greatly in different contexts (e.g., time and location). This +is because user behavior is jointly determined by two types of factors: +intrinsic factors, which reflect consistent user preference, and extrinsic +factors, which reflect external incentives that may vary in different contexts. +Differentiating between intrinsic and extrinsic factors helps learn user +behaviors better. However, existing studies have only considered +differentiating them from a single, pre-defined context (e.g., time or +location), ignoring the fact that a user's extrinsic factors may be influenced +by the interplay of various contexts at the same time. In this paper, we +propose the Intrinsic-Extrinsic Disentangled Recommendation (IEDR) model, a +generic framework that differentiates intrinsic from extrinsic factors +considering various contexts simultaneously, enabling more accurate +differentiation of factors and hence the improvement of recommendation +accuracy. IEDR contains a context-invariant contrastive learning component to +capture intrinsic factors, and a disentanglement component to extract extrinsic +factors under the interplay of various contexts. The two components work +together to achieve effective factor learning. Extensive experiments on +real-world datasets demonstrate IEDR's effectiveness in learning disentangled +factors and significantly improving recommendation accuracy by up to 4% in +NDCG. + +
+
+ comment: 32 pages, 13 figures, 11 tables. Accepted by Transactions of + Information Systems +
+
+
+
+
+ + ☆ O-RAN xApps Conflict Management using Graph Convolutional Networks + + +
+ Open Radio Access Network (O-RAN) adopts a flexible, open, and virtualized +structure with standardized interfaces, reducing dependency on a single +supplier. Conflict management in O-RAN refers to the process of identifying and +resolving conflicts between network applications. xApps are applications +deployed at the RAN Intelligent Controller (RIC) that leverage advanced AI/ML +algorithms to make dynamic decisions for network optimization. The lack of a +unified mechanism to coordinate and prioritize the actions of different +applications can create three types of conflicts (direct, indirect, and +implicit). In our paper, we introduce a novel data-driven GCN-based method +called Graph-based xApps Conflict and Root Cause Analysis Engine (GRACE) based +on Graph Convolutional Network (GCN). It detects three types of conflicts +(direct, indirect, and implicit) and pinpoints the root causes (xApps). GRACE +captures the complex and hidden dependencies among the xApps, the controlled +parameters, and the KPIs in O-RAN to detect possible conflicts. Then, it +identifies the root causes (xApps) contributing to the detected conflicts. The +proposed method was tested on highly imbalanced datasets where the number of +conflict instances ranges from 40% to 10%. The model is tested in a setting +that simulates real-world scenarios where conflicts are rare to assess its +performance and generalizability. Experimental results demonstrate an +exceptional performance, achieving a high F1-score greater than 98% for all the +case studies. + +
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ☆ DO-IQS: Dynamics-Aware Offline Inverse Q-Learning for Optimal Stopping + with Unknown Gain Functions + + +
+ We consider the Inverse Optimal Stopping (IOS) problem where, based on stopped +expert trajectories, one aims to recover the optimal stopping region through +continuation and stopping gain functions approximation. The uniqueness of the +stopping region allows the use of IOS in real-world applications with safety +concerns. While current state-of-the-art inverse reinforcement learning methods +recover both a Q-function and the corresponding optimal policy, they fail to +account for specific challenges posed by optimal stopping problems. These +include data sparsity near the stopping region, non-Markovian nature of the +continuation gain, a proper treatment of boundary conditions, the need for a +stable offline approach for risk-sensitive applications, and a lack of a +quality evaluation metric. These challenges are addressed with the proposed +Dynamics-Aware Offline Inverse Q-Learning for Optimal Stopping (DO-IQS), which +incorporates temporal information by approximating the cumulative continuation +gain together with the world dynamics and the Q-function without querying +the environment. Moreover, a confidence-based oversampling approach is proposed +to treat the data sparsity problem. We demonstrate the performance of our +models on real and artificial data including an optimal intervention for +critical events problem. + +
+
+
+
+
+ + ☆ An Aspect Extraction Framework using Different Embedding Types, Learning + Models, and Dependency Structure + + +
+ Aspect-based sentiment analysis has gained significant attention in recent +years due to its ability to provide fine-grained insights for sentiment +expressions related to specific features of entities. An important component of +aspect-based sentiment analysis is aspect extraction, which involves +identifying and extracting aspect terms from text. Effective aspect extraction +serves as the foundation for accurate sentiment analysis at the aspect level. +In this paper, we propose aspect extraction models that use different types of +embeddings for words and part-of-speech tags and that combine several learning +models. We also propose tree positional encoding that is based on dependency +parsing output to capture better the aspect positions in sentences. In +addition, a new aspect extraction dataset is built for Turkish by machine +translating an English dataset in a controlled setting. The experiments +conducted on two Turkish datasets showed that the proposed models mostly +outperform the studies that use the same datasets, and incorporating tree +positional encoding increases the performance of the models. + +
+
+ comment: Aspect-based Sentiment Analysis, Aspect Extraction, Natural Language + Processing, Machine Learning, Deep Neural Networks, Turkish +
+
+
+
+
+ + ☆ Rethinking Synthetic Data definitions: A privacy driven approach + + +
+ Synthetic data is gaining traction as a cost-effective solution for the +increasing data demands of AI development and can be generated either from +existing knowledge or derived data captured from real-world events. The source +of the synthetic data generation and the technique used significantly impact +its residual privacy risk and therefore its opportunity for sharing. +Traditional classifications of synthetic data types no longer fit the newer +generation techniques and there is a need to better align the classification +with practical needs. We suggest a new way of grouping synthetic data types +that better supports privacy evaluations to aid regulatory policymaking. Our +novel classification provides flexibility to new advancements like deep +generative methods and offers a more practical framework for future +applications. + +
+
+
+
+
+ + ☆ Collaborative Expert LLMs Guided Multi-Objective Molecular Optimization + + +
+ Molecular optimization is a crucial yet complex and time-intensive process +that often acts as a bottleneck for drug development. Traditional methods rely +heavily on trial and error, making multi-objective optimization both +time-consuming and resource-intensive. Current AI-based methods have shown +limited success in handling multi-objective optimization tasks, hampering their +practical utilization. To address this challenge, we present MultiMol, a +collaborative large language model (LLM) system designed to guide +multi-objective molecular optimization. MultiMol comprises two agents, +including a data-driven worker agent and a literature-guided research agent. +The data-driven worker agent is a large language model being fine-tuned to +learn how to generate optimized molecules considering multiple objectives, +while the literature-guided research agent is responsible for searching +task-related literature to find useful prior knowledge that facilitates +identifying the most promising optimized candidates. In evaluations across six +multi-objective optimization tasks, MultiMol significantly outperforms existing +methods, achieving an 82.30% success rate, in sharp contrast to the 27.50% +success rate of the current strongest methods. To further validate its practical +impact, we tested MultiMol on two real-world challenges. First, we enhanced the +selectivity of Xanthine Amine Congener (XAC), a promiscuous ligand that binds +both A1R and A2AR, successfully biasing it towards A1R. Second, we improved the +bioavailability of Saquinavir, an HIV-1 protease inhibitor with known +bioavailability limitations. Overall, these results indicate that MultiMol +represents a highly promising approach for multi-objective molecular +optimization, holding great potential to accelerate the drug development +process and contribute to the advancement of pharmaceutical research. + +
+
+
+
+
+ + ☆ State-offset Tuning: State-based Parameter-Efficient Fine-Tuning for + State Space Models + + +
+ State Space Models (SSMs) have emerged as efficient alternatives to +Transformers, mitigating their quadratic computational cost. However, the +application of Parameter-Efficient Fine-Tuning (PEFT) methods to SSMs remains +largely unexplored. In particular, prompt-based methods like Prompt Tuning and +Prefix-Tuning, which are widely used in Transformers, do not perform well on +SSMs. To address this, we propose state-based methods as a superior alternative +to prompt-based methods. This new family of methods naturally stems from the +architectural characteristics of SSMs. State-based methods adjust state-related +features directly instead of depending on external prompts. Furthermore, we +introduce a novel state-based PEFT method: State-offset Tuning. At every +timestep, our method directly affects the state at the current step, leading to +more effective adaptation. Through extensive experiments across diverse +datasets, we demonstrate the effectiveness of our method. Code is available at +https://github.com/furiosa-ai/ssm-state-tuning. + +
+
+ comment: Code is available at https://github.com/furiosa-ai/ssm-state-tuning +
+
+
+
+
+ + ☆ Federated Learning for Predicting Mild Cognitive Impairment to Dementia + Conversion + + +
+ Dementia is a progressive condition that impairs an individual's cognitive +health and daily functioning, with mild cognitive impairment (MCI) often +serving as its precursor. The prediction of MCI to dementia conversion has been +well studied, but previous studies have almost always focused on traditional +Machine Learning (ML) based methods that require sharing sensitive clinical +information to train predictive models. This study proposes a privacy-enhancing +solution using Federated Learning (FL) to train predictive models for MCI to +dementia conversion without sharing sensitive data, leveraging socio +demographic and cognitive measures. We simulated and compared two network +architectures, Peer to Peer (P2P) and client-server, to enable collaborative +learning. Our results demonstrated that FL had comparable predictive +performance to centralized ML, and each clinical site showed similar +performance without sharing local data. Moreover, the predictive performance of +FL models was superior to site specific models trained without collaboration. +This work highlights that FL can eliminate the need for data sharing without +compromising model efficacy. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Differentially Private Learners for Heterogeneous Treatment Effects ICLR 2025 + + +
+ Patient data is widely used to estimate heterogeneous treatment effects and +thus understand the effectiveness and safety of drugs. Yet, patient data +includes highly sensitive information that must be kept private. In this work, +we aim to estimate the conditional average treatment effect (CATE) from +observational data under differential privacy. Specifically, we present +DP-CATE, a novel framework for CATE estimation that is Neyman-orthogonal and +further ensures differential privacy of the estimates. Our framework is highly +general: it applies to any two-stage CATE meta-learner with a Neyman-orthogonal +loss function, and any machine learning model can be used for nuisance +estimation. We further provide an extension of our DP-CATE, where we employ +RKHS regression to release the complete CATE function while ensuring +differential privacy. We demonstrate our DP-CATE across various experiments +using synthetic and real-world datasets. To the best of our knowledge, we are +the first to provide a framework for CATE estimation that is Neyman-orthogonal +and differentially private. + +
+
+ comment: Published at ICLR 2025 +
+
+
+
+
+ + ☆ TEDDY: A Family Of Foundation Models For Understanding Single Cell + Biology + + +
+ Understanding the biological mechanism of disease is critical for medicine, +and in particular drug discovery. AI-powered analysis of genome-scale +biological data holds great potential in this regard. The increasing +availability of single-cell RNA sequencing data has enabled the development of +large foundation models for disease biology. However, existing foundation +models either do not improve or only modestly improve over task-specific models +in downstream applications. Here, we explored two avenues for improving the +state-of-the-art. First, we scaled the pre-training dataset to 116 million +cells, which is larger than those used by previous models. Second, we leveraged +the availability of large-scale biological annotations as a form of supervision +during pre-training. We trained the TEDDY family of models comprising six +transformer-based state-of-the-art single-cell foundation models with 70 +million, 160 million, and 400 million parameters. We vetted our models on two +downstream evaluation tasks -- identifying the underlying disease state of +held-out donors not seen during training and distinguishing healthy cells from +diseased ones for disease conditions and donors not seen during training. +Scaling experiments showed that performance improved predictably with both data +volume and parameter count. Our models showed substantial improvement over +existing work on the first task and more muted improvements on the second. + +
+
+
+
+
+ + ☆ Open-Source Large Language Models as Multilingual Crowdworkers: + Synthesizing Open-Domain Dialogues in Several Languages With No Examples in + Targets and No Machine Translation + + +
+ The prevailing paradigm in the domain of Open-Domain Dialogue agents +predominantly focuses on the English language, encompassing both models and +datasets. Furthermore, the financial and temporal investments required for +crowdsourcing such datasets for finetuning are substantial, particularly when +multiple languages are involved. Fortunately, advancements in Large Language +Models (LLMs) have unveiled a plethora of possibilities across diverse tasks. +Specifically, instruction-tuning has enabled LLMs to execute tasks based on +natural language instructions, occasionally surpassing the performance of human +crowdworkers. Additionally, these models possess the capability to function in +various languages within a single thread. Consequently, to generate new samples +in different languages, we propose leveraging these capabilities to replicate +the data collection process. We introduce a pipeline for generating Open-Domain +Dialogue data in multiple Target Languages using LLMs, with demonstrations +provided in a unique Source Language. By eschewing explicit Machine Translation +in this approach, we enhance the adherence to language-specific nuances. We +apply this methodology to the PersonaChat dataset. To enhance the openness of +generated dialogues and mimic real life scenarios, we added the notion of speech +events corresponding to the type of conversation the speakers are involved in +and also that of common ground which represents the premises of a conversation. + +
+
+
+
+
+ + ☆ Data Poisoning Attacks to Locally Differentially Private Range Query + Protocols + + +
+ Trajectory data, which tracks movements through geographic locations, is +crucial for improving real-world applications. However, collecting such +sensitive data raises considerable privacy concerns. Local differential privacy +(LDP) offers a solution by allowing individuals to locally perturb their +trajectory data before sharing it. Despite its privacy benefits, LDP protocols +are vulnerable to data poisoning attacks, where attackers inject fake data to +manipulate aggregated results. In this work, we make the first attempt to +analyze vulnerabilities in several representative LDP trajectory protocols. We +propose \textsc{TraP}, a heuristic algorithm for data \underline{P}oisoning +attacks using a prefix-suffix method to optimize fake \underline{Tra}jectory +selection, significantly reducing computational complexity. Our experimental +results demonstrate that our attack can substantially increase target pattern +occurrences in the perturbed trajectory dataset with few fake users. This study +underscores the urgent need for robust defenses and better protocol designs to +safeguard LDP trajectory data against malicious manipulation. + +
+
+
+
+
+ + ☆ Conceptualizing Uncertainty + + +
+ Uncertainty in machine learning refers to the degree of confidence or lack +thereof in a model's predictions. While uncertainty quantification methods +exist, explanations of uncertainty, especially in high-dimensional settings, +remain an open challenge. Existing work focuses on feature attribution +approaches which are restricted to local explanations. Understanding +uncertainty, its origins, and characteristics on a global scale is crucial for +enhancing interpretability and trust in a model's predictions. In this work, we +propose to explain the uncertainty in high-dimensional data classification +settings by means of concept activation vectors which give rise to local and +global explanations of uncertainty. We demonstrate the utility of the generated +explanations by leveraging them to refine and improve our model. + +
+
+
+
+
+ + ☆ Gradient Deconfliction via Orthogonal Projections onto Subspaces For + Multi-task Learning WSDM 2025 + + +
+ Although multi-task learning (MTL) has been a preferred approach and +successfully applied in many real-world scenarios, MTL models are not +guaranteed to outperform single-task models on all tasks mainly due to the +negative effects of conflicting gradients among the tasks. In this paper, we +fully examine the influence of conflicting gradients and further emphasize the +importance and advantages of achieving non-conflicting gradients which allows +simple but effective trade-off strategies among the tasks with stable +performance. Based on our findings, we propose the Gradient Deconfliction via +Orthogonal Projections onto Subspaces (GradOPS) spanned by other task-specific +gradients. Our method not only solves all conflicts among the tasks, but can +also effectively search for diverse solutions towards different trade-off +preferences among the tasks. Theoretical analysis on convergence is provided, +and performance of our algorithm is fully testified on multiple benchmarks in +various domains. Results demonstrate that our method can effectively find +multiple state-of-the-art solutions with different trade-off strategies among +the tasks on multiple datasets. + +
+
+ comment: WSDM 2025 +
+
+
+
+
+ + ☆ Early-Stopped Mirror Descent for Linear Regression over Convex Bodies + + +
+ Early-stopped iterative optimization methods are widely used as alternatives +to explicit regularization, and direct comparisons between early-stopping and +explicit regularization have been established for many optimization geometries. +However, most analyses depend heavily on the specific properties of the +optimization geometry or strong convexity of the empirical objective, and it +remains unclear whether early-stopping could ever be less statistically +efficient than explicit regularization for some particular shape constraint, +especially in the overparameterized regime. To address this question, we study +the setting of high-dimensional linear regression under additive Gaussian noise +when the ground truth is assumed to lie in a known convex body and the task is +to minimize the in-sample mean squared error. Our main result shows that for +any convex body and any design matrix, up to an absolute constant factor, the +worst-case risk of unconstrained early-stopped mirror descent with an +appropriate potential is at most that of the least squares estimator +constrained to the convex body. We achieve this by constructing algorithmic +regularizers based on the Minkowski functional of the convex body. + +
+
+
+
+
+ + ☆ Simplicial SMOTE: Oversampling Solution to the Imbalanced Learning + Problem KDD 2025 + + +
+ SMOTE (Synthetic Minority Oversampling Technique) is the established +geometric approach to random oversampling to balance classes in the imbalanced +learning problem, followed by many extensions. Its idea is to introduce +synthetic data points of the minor class, with each new point being the convex +combination of an existing data point and one of its k-nearest neighbors. In +this paper, by viewing SMOTE as sampling from the edges of a geometric +neighborhood graph and borrowing tools from the topological data analysis, we +propose a novel technique, Simplicial SMOTE, that samples from the simplices of +a geometric neighborhood simplicial complex. A new synthetic point is defined +by the barycentric coordinates w.r.t. a simplex spanned by an arbitrary number +of data points being sufficiently close rather than a pair. Such a replacement +of the geometric data model results in better coverage of the underlying data +distribution compared to existing geometric sampling methods and allows the +generation of synthetic points of the minority class closer to the majority +class on the decision boundary. We experimentally demonstrate that our +Simplicial SMOTE outperforms several popular geometric sampling methods, +including the original SMOTE. Moreover, we show that simplicial sampling can be +easily integrated into existing SMOTE extensions. We generalize and evaluate +simplicial extensions of the classic Borderline SMOTE, Safe-level SMOTE, and +ADASYN algorithms, all of which outperform their graph-based counterparts. + +
+
+ comment: Accepted at KDD 2025 (research track) +
+
+
+
+
+ + ☆ Evolutionary Prediction Games + + +
+ When users decide whether to use a system based on the quality of predictions +they receive, learning has the capacity to shape the population of users it +serves - for better or worse. This work aims to study the long-term +implications of this process through the lens of evolutionary game theory. We +introduce and study evolutionary prediction games, designed to capture the role +of learning as a driver of natural selection between groups of users, and hence +a determinant of evolutionary outcomes. Our main theoretical results show that: +(i) in settings with unlimited data and compute, learning tends to reinforce +the survival of the fittest, and (ii) in more realistic settings, opportunities +for coexistence emerge. We analyze these opportunities in terms of their +stability and feasibility, present several mechanisms that can sustain their +existence, and empirically demonstrate our findings using real and synthetic +data. + +
+
+ comment: Comments are welcome +
+
+
+
+
+ + ☆ Predicting Practically? Domain Generalization for Predictive Analytics + in Real-world Environments + + +
+ Predictive machine learning models are widely used in customer relationship +management (CRM) to forecast customer behaviors and support decision-making. +However, the dynamic nature of customer behaviors often results in significant +distribution shifts between training data and serving data, leading to +performance degradation in predictive models. Domain generalization, which aims +to train models that can generalize to unseen environments without prior +knowledge of their distributions, has become a critical area of research. In +this work, we propose a novel domain generalization method tailored to handle +complex distribution shifts, encompassing both covariate and concept shifts. +Our method builds upon the Distributionally Robust Optimization framework, +optimizing model performance over a set of hypothetical worst-case +distributions rather than relying solely on the training data. Through +simulation experiments, we demonstrate the working mechanism of the proposed +method. We also conduct experiments on a real-world customer churn dataset, and +validate its effectiveness in both temporal and spatial generalization +settings. Finally, we discuss the broader implications of our method for +advancing Information Systems (IS) design research, particularly in building +robust predictive models for dynamic managerial environments. + +
+
+
+
+
+ + ☆ Multi-Agent DRL for Queue-Aware Task Offloading in Hierarchical + MEC-Enabled Air-Ground Networks + + +
+ Mobile edge computing (MEC)-enabled air-ground networks are a key component +of 6G, employing aerial base stations (ABSs) such as unmanned aerial vehicles +(UAVs) and high-altitude platform stations (HAPS) to provide dynamic services +to ground IoT devices (IoTDs). These IoTDs support real-time applications +(e.g., multimedia and Metaverse services) that demand high computational +resources and strict quality of service (QoS) guarantees in terms of latency +and task queue management. Given their limited energy and processing +capabilities, IoTDs rely on UAVs and HAPS to offload tasks for distributed +processing, forming a multi-tier MEC system. This paper tackles the overall +energy minimization problem in MEC-enabled air-ground integrated networks +(MAGIN) by jointly optimizing UAV trajectories, computing resource allocation, +and queue-aware task offloading decisions. The optimization is challenging due +to the nonconvex, nonlinear nature of this hierarchical system, which renders +traditional methods ineffective. We reformulate the problem as a multi-agent +Markov decision process (MDP) with continuous action spaces and heterogeneous +agents, and propose a novel variant of multi-agent proximal policy optimization +with a Beta distribution (MAPPO-BD) to solve it. Extensive simulations show +that MAPPO-BD outperforms baseline schemes, achieving superior energy savings +and efficient resource management in MAGIN while meeting queue delay and edge +computing constraints. + +
+
+
+
+
+ + ☆ GNNMerge: Merging of GNN Models Without Accessing Training Data + + +
+ Model merging has gained prominence in machine learning as a method to +integrate multiple trained models into a single model without accessing the +original training data. While existing approaches have demonstrated success in +domains such as computer vision and NLP, their application to Graph Neural +Networks (GNNs) remains unexplored. These methods often rely on the assumption +of shared initialization, which is seldom applicable to GNNs. In this work, we +undertake the first benchmarking study of model merging algorithms for GNNs, +revealing their limited effectiveness in this context. To address these +challenges, we propose GNNMerge, which utilizes a task-agnostic node embedding +alignment strategy to merge GNNs. Furthermore, we establish that under a mild +relaxation, the proposed optimization objective admits direct analytical +solutions for widely used GNN architectures, significantly enhancing its +computational efficiency. Empirical evaluations across diverse datasets, tasks, +and architectures establish GNNMerge to be up to 24% more accurate than +existing methods while delivering over 2 orders of magnitude speed-up compared +to training from scratch. + +
+
+
+
+
+ + ☆ Paths and Ambient Spaces in Neural Loss Landscapes AISTATS 2025 + + +
+ Understanding the structure of neural network loss surfaces, particularly the +emergence of low-loss tunnels, is critical for advancing neural network theory +and practice. In this paper, we propose a novel approach to directly embed loss +tunnels into the loss landscape of neural networks. Exploring the properties of +these loss tunnels offers new insights into their length and structure and +sheds light on some common misconceptions. We then apply our approach to +Bayesian neural networks, where we improve subspace inference by identifying +pitfalls and proposing a more natural prior that better guides the sampling +procedure. + +
+
+ comment: 9 pages, Accepted at AISTATS 2025 +
+
+
+
+
+ + ☆ A Novel Multi-Criteria Local Latin Hypercube Refinement System for + Commutation Angle Improvement in IPMSMs + + +
+ The commutation angle is defined as the angle between the fundamental of the +motor phase current and the fundamental of the back-EMF. It can be utilised to +provide a compensating effect in IPMSMs. This is due to the reluctance torque +component being dependent on the commutation angle of the phase current even +before entering the extended speed range. A real-time maximum torque per +current and voltage strategy is demonstrated to find the trajectory and optimum +commutation angles, gamma, where the level of accuracy depends on the +application and available computational speed. A magnet volume reduction using +a novel multi-criteria local Latin hypercube refinement (MLHR) sampling system +is also presented to improve the optimisation process. The proposed new +technique minimises the magnet mass to motor torque density whilst maintaining +a similar phase current level. A mapping of gamma allows the determination of +the optimum angles, as shown in this paper. The 3rd generation Toyota Prius +IPMSM is considered as the reference motor, where the rotor configuration is +altered to allow for an individual assessment. + +
+
+
+
+
+ + ☆ Transformers for molecular property prediction: Domain adaptation + efficiently improves performance + + +
+ Most of the current transformer-based chemical language models are +pre-trained on millions to billions of molecules. However, the improvement from +such scaling in dataset size is not confidently linked to improved molecular +property prediction. The aim of this study is to investigate and overcome some +of the limitations of transformer models in predicting molecular properties. +Specifically, we examine the impact of pre-training dataset size and diversity +on the performance of transformer models and investigate the use of domain +adaptation as a technique for improving model performance. First, our findings +indicate that increasing pre-training dataset size beyond 400K molecules from +the GuacaMol dataset does not result in a significant improvement on four ADME +endpoints, namely, solubility, permeability, microsomal stability, and plasma +protein binding. Second, our results demonstrate that using domain adaptation +by further training the transformer model on a small set of domain-relevant +molecules, i.e., a few hundred to a few thousand, using multi-task regression +of physicochemical properties was sufficient to significantly improve +performance for three out of the four investigated ADME endpoints (P-value < +0.001). Finally, we observe that a model pre-trained on 400K molecules and +domain-adapted on a few hundred/thousand molecules performs similarly (P-value +> 0.05) to more complicated transformer models like MolBERT (pre-trained on 1.3M +molecules) and MolFormer (pre-trained on 100M molecules). A comparison to a +random forest model trained on basic physicochemical properties showed similar +performance to the examined transformer models. We believe that current +transformer models can be improved through further systematic analysis of +pre-training and downstream data, pre-training objectives, and scaling laws, +ultimately leading to better and more helpful models. + +
+
+
+
+
+ + ☆ Video Super-Resolution: All You Need is a Video Diffusion Model + + +
+ We present a generic video super-resolution algorithm in this paper, based on +the Diffusion Posterior Sampling framework with an unconditional video +generation model in latent space. The video generation model, a diffusion +transformer, functions as a space-time model. We argue that a powerful model, +which learns the physics of the real world, can easily handle various kinds of +motion patterns as prior knowledge, thus eliminating the need for explicit +estimation of optical flows or motion parameters for pixel alignment. +Furthermore, a single instance of the proposed video diffusion transformer +model can adapt to different sampling conditions without re-training. Due to +limited computational resources and training data, our experiments provide +empirical evidence of the algorithm's strong super-resolution capabilities +using synthetic data. + +
+
+
+
+
+ + ☆ Leap: Inductive Link Prediction via Learnable Topology Augmentation + + +
+ Link prediction is a crucial task in many downstream applications of graph +machine learning. To this end, Graph Neural Network (GNN) is a widely used +technique for link prediction, mainly in transductive settings, where the goal +is to predict missing links between existing nodes. However, many real-life +applications require an inductive setting that accommodates new nodes +coming into an existing graph. Thus, recently inductive link prediction has +attracted considerable attention, and a multi-layer perceptron (MLP) is the +popular choice of most studies to learn node representations. However, these +approaches have limited expressivity and do not fully capture the graph's +structural signal. Therefore, in this work we propose LEAP, an inductive link +prediction method based on LEArnable toPology augmentation. Unlike previous +methods, LEAP models the inductive bias from both the structure and node +features, and hence is more expressive. To the best of our knowledge, this is +the first attempt to provide structural contexts for new nodes via learnable +augmentation in inductive settings. Extensive experiments on seven real-world +homogeneous and heterogeneous graphs demonstrate that LEAP significantly +surpasses SOTA methods. The improvements are up to 22% and 17% in terms of +AUC and average precision, respectively. The code and datasets are available on +GitHub (https://github.com/AhmedESamy/LEAP/). + +
+
+ comment: published in Machine Learning, Optimization, and Data Science, + Springer Nature Switzerland +
+
+
+
+
+ + ☆ LLM as GNN: Graph Vocabulary Learning for Text-Attributed Graph + Foundation Models + + +
+ Text-Attributed Graphs (TAGs), where each node is associated with text +descriptions, are ubiquitous in real-world scenarios. They typically exhibit +distinctive structure and domain-specific knowledge, motivating the development +of a Graph Foundation Model (GFM) that generalizes across diverse graphs and +tasks. Despite large efforts to integrate Large Language Models (LLMs) and +Graph Neural Networks (GNNs) for TAGs, existing approaches suffer from +decoupled architectures with two-stage alignment, limiting their synergistic +potential. Even worse, existing methods assign out-of-vocabulary (OOV) tokens +to graph nodes, leading to graph-specific semantics, token explosion, and +incompatibility with task-oriented prompt templates, which hinders cross-graph +and cross-task transferability. To address these challenges, we propose +PromptGFM, a versatile GFM for TAGs grounded in graph vocabulary learning. +PromptGFM comprises two key components: (1) Graph Understanding Module, which +explicitly prompts LLMs to replicate the finest GNN workflow within the text +space, facilitating seamless GNN-LLM integration and elegant graph-text +alignment; (2) Graph Inference Module, which establishes a language-based graph +vocabulary ensuring expressiveness, transferability, and scalability, enabling +readable instructions for LLM fine-tuning. Extensive experiments demonstrate +our superiority and transferability across diverse graphs and tasks. The code +is available at https://github.com/agiresearch/PromptGFM. + +
+
+
+
+
+ + ☆ Differential Machine Learning for Time Series Prediction + + +
+ Accurate time series prediction is challenging due to the inherent +nonlinearity and sensitivity to initial conditions. We propose a novel approach +that enhances neural network predictions through differential learning, which +involves training models on both the original time series and its differential +series. Specifically, we develop a differential long short-term memory +(Diff-LSTM) network that uses a shared LSTM cell to simultaneously process both +data streams, effectively capturing intrinsic patterns and temporal dynamics. +Evaluated on the Mackey-Glass, Lorenz, and Rössler chaotic time series, as +well as a real-world financial dataset from ACI Worldwide Inc., our results +demonstrate that the Diff-LSTM network outperforms prevalent models such as +recurrent neural networks, convolutional neural networks, and bidirectional and +encoder-decoder LSTM networks in both short-term and long-term predictions. +This framework offers a promising solution for enhancing time series +prediction, even when comprehensive knowledge of the underlying dynamics of the +time series is not fully available. + +
+
+
+
+
+ + ☆ Enhancing Vietnamese VQA through Curriculum Learning on Raw and + Augmented Text Representations AAAI-25 + + +
+ Visual Question Answering (VQA) is a multimodal task requiring reasoning +across textual and visual inputs, which becomes particularly challenging in +low-resource languages like Vietnamese due to linguistic variability and the +lack of high-quality datasets. Traditional methods often rely heavily on +extensive annotated datasets, computationally expensive pipelines, and large +pre-trained models, specifically in the domain of Vietnamese VQA, limiting +their applicability in such scenarios. To address these limitations, we propose +a training framework that combines a paraphrase-based feature augmentation +module with a dynamic curriculum learning strategy. Explicitly, augmented +samples are considered "easy" while raw samples are regarded as "hard". The +framework then utilizes a mechanism that dynamically adjusts the ratio of easy +to hard samples during training, progressively modifying the same dataset to +increase its difficulty level. By enabling gradual adaptation to task +complexity, this approach helps the Vietnamese VQA model generalize well, thus +improving overall performance. Experimental results show consistent +improvements on the OpenViVQA dataset and mixed outcomes on the ViVQA dataset, +highlighting both the potential and challenges of our approach in advancing VQA +for Vietnamese language. + +
+
+ comment: 10 pages, 3 figures, AAAI-25 Workshop on Document Understanding and + Intelligence +
+
+
+
+
+ + ☆ Exploring specialization and sensitivity of convolutional neural + networks in the context of simultaneous image augmentations + + +
+ Drawing parallels with the way biological networks are studied, we adapt the +treatment--control paradigm to explainable artificial intelligence research and +enrich it through multi-parametric input alterations. In this study, we propose +a framework for investigating the internal inference impacted by input data +augmentations. The internal changes in network operation are reflected in +activation changes measured by variance, which can be decomposed into +components related to each augmentation, employing Sobol indices and Shapley +values. These quantities enable one to visualize sensitivity to different +variables and use them for guided masking of activations. In addition, we +introduce a way of single-class sensitivity analysis where the candidates are +filtered according to their matching to prediction bias generated by targeted +damaging of the activations. Relying on the observed parallels, we assume that +the developed framework can potentially be transferred to studying biological +neural networks in complex environments. + +
+
+ comment: 26 pages; main text: 5 figures, 4 tables; appendix: 4 sections, 3 + tables; supplementary: 7 files (figures S1-S6: packed as 7z archive, S7: + single pdf file) +
+
+
+
+
+ + ☆ TrafficKAN-GCN: Graph Convolutional-based Kolmogorov-Arnold Network for + Traffic Flow Optimization + + +
+ Urban traffic optimization is critical for improving transportation +efficiency and alleviating congestion, particularly in large-scale dynamic +networks. Traditional methods, such as Dijkstra's and Floyd's algorithms, +provide effective solutions in static settings, but they struggle with the +spatial-temporal complexity of real-world traffic flows. In this work, we +propose TrafficKAN-GCN, a hybrid deep learning framework combining +Kolmogorov-Arnold Networks (KAN) with Graph Convolutional Networks (GCN), +designed to enhance urban traffic flow optimization. By integrating KAN's +adaptive nonlinear function approximation with GCN's spatial graph learning +capabilities, TrafficKAN-GCN captures both complex traffic patterns and +topological dependencies. We evaluate the proposed framework using real-world +traffic data from the Baltimore Metropolitan area. Compared with baseline +models such as MLP-GCN, standard GCN, and Transformer-based approaches, +TrafficKAN-GCN achieves competitive prediction accuracy while demonstrating +improved robustness in handling noisy and irregular traffic data. Our +experiments further highlight the framework's ability to redistribute traffic +flow, mitigate congestion, and adapt to disruptive events, such as the Francis +Scott Key Bridge collapse. This study contributes to the growing body of work +on hybrid graph learning for intelligent transportation systems, highlighting +the potential of combining KAN and GCN for real-time traffic optimization. +Future work will focus on reducing computational overhead and integrating +Transformer-based temporal modeling for enhanced long-term traffic prediction. +The proposed TrafficKAN-GCN framework offers a promising direction for +data-driven urban mobility management, balancing predictive accuracy, +robustness, and computational efficiency. + +
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ☆ Benchmarking Dynamic SLO Compliance in Distributed Computing Continuum + Systems + + +
+ Ensuring Service Level Objectives (SLOs) in large-scale architectures, such +as Distributed Computing Continuum Systems (DCCS), is challenging due to their +heterogeneous nature and varying service requirements across different devices +and applications. Additionally, unpredictable workloads and resource +limitations lead to fluctuating performance and violated SLOs. To improve SLO +compliance in DCCS, one possibility is to apply machine learning; however, the +design choices are often left to the developer. To that extent, we provide a +benchmark of Active Inference -- an emerging method from neuroscience -- +against three established reinforcement learning algorithms (Deep Q-Network, +Advantage Actor-Critic, and Proximal Policy Optimization). We consider a +realistic DCCS use case: an edge device running a video conferencing +application alongside a WebSocket server streaming videos. Using one of the +respective algorithms, we continuously monitor key performance metrics, such as +latency and bandwidth usage, to dynamically adjust parameters -- including the +number of streams, frame rate, and resolution -- to optimize service quality +and user experience. To test algorithms' adaptability to constant system +changes, we simulate dynamically changing SLOs and both instant and gradual +data-shift scenarios, such as network bandwidth limitations and fluctuating +device thermal states. Although the evaluated algorithms all showed advantages +and limitations, our findings demonstrate that Active Inference is a promising +approach for ensuring SLO compliance in DCCS, offering lower memory usage, +stable CPU utilization, and fast convergence. + +
+
+
+
+
+ + ☆ Conformal Transformations for Symmetric Power Transformers SC + + +
+ Transformers with linear attention offer significant computational advantages +over softmax-based transformers but often suffer from degraded performance. The +symmetric power (sympow) transformer, a particular type of linear transformer, +addresses some of this performance gap by leveraging symmetric tensor +embeddings, achieving comparable performance to softmax transformers. However, +the finite capacity of the recurrent state in sympow transformers limits their +ability to retain information, leading to performance degradation when scaling +the training or evaluation context length. To address this issue, we propose +the conformal-sympow transformer, which dynamically frees up capacity using +data-dependent multiplicative gating and adaptively stores information using +data-dependent rotary embeddings. Preliminary experiments on the LongCrawl64 +dataset demonstrate that conformal-sympow overcomes the limitations of sympow +transformers, achieving robust performance across scaled training and +evaluation contexts. + +
+
+ comment: SCOPE Workshop at ICLR 2025 +
+
+
+
+
+ + ☆ Trajectory Prediction for Autonomous Driving: Progress, Limitations, and + Future Directions + + +
+ As the potential for autonomous vehicles to be integrated on a large scale +into modern traffic systems continues to grow, ensuring safe navigation in +dynamic environments is crucial for smooth integration. To guarantee safety and +prevent collisions, autonomous vehicles must be capable of accurately +predicting the trajectories of surrounding traffic agents. Over the past +decade, significant efforts from both academia and industry have been dedicated +to designing solutions for precise trajectory forecasting. These efforts have +produced a diverse range of approaches, raising questions about the differences +between these methods and whether trajectory prediction challenges have been +fully addressed. This paper reviews a substantial portion of recent trajectory +prediction methods and devises a taxonomy to classify existing solutions. A +general overview of the prediction pipeline is also provided, covering input +and output modalities, modeling features, and prediction paradigms discussed in +the literature. In addition, the paper discusses active research areas within +trajectory prediction, addresses the posed research questions, and highlights +the remaining research gaps and challenges. + +
+
+
+
+
+ + ☆ Exploring the Potential of Large Language Models as Predictors in + Dynamic Text-Attributed Graphs + + +
+ With the rise of large language models (LLMs), there has been growing +interest in Graph Foundation Models (GFMs) for graph-based tasks. By leveraging +LLMs as predictors, GFMs have demonstrated impressive generalizability across +various tasks and datasets. However, existing research on LLMs as predictors +has predominantly focused on static graphs, leaving their potential in dynamic +graph prediction unexplored. In this work, we pioneer using LLMs for predictive +tasks on dynamic graphs. We identify two key challenges: the constraints +imposed by context length when processing large-scale historical data and the +significant variability in domain characteristics, both of which complicate the +development of a unified predictor. To address these challenges, we propose the +GraphAgent-Dynamic (GAD) Framework, a multi-agent system that leverages +collaborative LLMs. In contrast to using a single LLM as the predictor, GAD +incorporates global and local summary agents to generate domain-specific +knowledge, enhancing its transferability across domains. Additionally, +knowledge reflection agents enable adaptive updates to GAD's knowledge, +maintaining a unified and self-consistent architecture. In experiments, GAD +achieves performance comparable to, or even exceeding, that of fully supervised +graph neural networks without dataset-specific training. Finally, to enhance +the task-specific performance of LLM-based predictors, we discuss potential +improvements, such as dataset-specific fine-tuning of LLMs. By developing +tailored strategies for different tasks, we provide new insights for the future +design of LLM-based predictors. + +
+
+
+
+
+ + ☆ Less is more? Rewards in RL for Cyber Defence + + +
+ The last few years have seen an explosion of interest in autonomous cyber +defence agents based on deep reinforcement learning. Such agents are typically +trained in a cyber gym environment, also known as a cyber simulator, at least +32 of which have already been built. Most, if not all, cyber gyms provide dense +"scaffolded" reward functions which combine many penalties or incentives for a +range of (un)desirable states and costly actions. Whilst dense rewards help +alleviate the challenge of exploring complex environments, yielding seemingly +effective strategies from relatively few environment steps, they are also known +to bias the solutions an agent can find, potentially towards suboptimal +solutions. Sparse rewards could offer preferable or more effective solutions +and have been overlooked by cyber gyms to date. In this work we set out to +evaluate whether sparse reward functions might enable training more effective +cyber defence agents. Towards this goal we first break down several evaluation +limitations in existing work by proposing a ground truth evaluation score that +goes beyond the standard RL paradigm used to train and evaluate agents. By +adapting a well-established cyber gym to accommodate our methodology and ground +truth score, we propose and evaluate two sparse reward mechanisms and compare +them with a typical dense reward. Our evaluation considers a range of network +sizes, from 2 to 50 nodes, and both reactive and proactive defensive actions. +Our results show that sparse rewards, particularly positive reinforcement for +an uncompromised network state, enable the training of more effective cyber +defence agents. Furthermore, we show that sparse rewards provide more stable +training than dense rewards, and that both effectiveness and training stability +are robust to a variety of cyber environment considerations. + +
+
+ comment: 4 Pages +
+
+
+
+
+ + ☆ Structural Entropy Guided Unsupervised Graph Out-Of-Distribution + Detection AAAI 2025 + + +
+ With the emergence of huge amounts of unlabeled data, unsupervised +out-of-distribution (OOD) detection is vital for ensuring the reliability of +graph neural networks (GNNs) by identifying OOD samples from in-distribution +(ID) ones during testing, where encountering novel or unknown data is +inevitable. Existing methods often suffer from compromised performance due to +redundant information in graph structures, which impairs their ability to +effectively differentiate between ID and OOD data. To address this challenge, +we propose SEGO, an unsupervised framework that integrates structural entropy +into OOD detection regarding graph classification. Specifically, within the +architecture of contrastive learning, SEGO introduces an anchor view in the +form of a coding tree by minimizing structural entropy. The obtained coding tree +effectively removes redundant information from graphs while preserving +essential structural information, enabling the capture of distinct graph +patterns between ID and OOD samples. Furthermore, we present a multi-grained +contrastive learning scheme at local, global, and tree levels using triplet +views, where coding trees with essential information serve as the anchor view. +Extensive experiments on real-world datasets validate the effectiveness of +SEGO, demonstrating superior performance over state-of-the-art baselines in OOD +detection. Specifically, our method achieves the best performance on 9 out of +10 dataset pairs, with an average improvement of 3.7% on OOD detection +datasets, significantly surpassing the best competitor by 10.8% on the +FreeSolv/ToxCast dataset pair. + +
+
+ comment: Accepted by AAAI 2025 (The 39th Annual AAAI Conference on Artificial + Intelligence) +
+
+
+
+
+ + ☆ PAIR: A Novel Large Language Model-Guided Selection Strategy for + Evolutionary Algorithms + + +
+ Evolutionary Algorithms (EAs) employ random or simplistic selection methods, +limiting their exploration of solution spaces and convergence to optimal +solutions. The randomness in performing crossover or mutations may limit the +model's ability to evolve efficiently. This paper introduces Preference-Aligned +Individual Reciprocity (PAIR), a novel selection approach leveraging Large +Language Models to emulate human-like mate selection, thereby introducing +intelligence to the pairing process in EAs. PAIR prompts an LLM to evaluate +individuals within a population based on genetic diversity, fitness level, and +crossover compatibility, guiding more informed pairing decisions. We evaluated +PAIR against a baseline method called LLM-driven EA (LMEA), published recently. +Results indicate that PAIR significantly outperforms LMEA across various TSP +instances, achieving lower optimality gaps and improved convergence. This +performance is especially noticeable when combined with the flash thinking +model, demonstrating increased population diversity to escape local optima. In +general, PAIR provides a new strategy in the area of in-context learning for +LLM-driven selection in EAs via sophisticated preference modelling, paving the +way for improved solutions and further studies into LLM-guided optimization. + +
+
+
+
+
+ + ☆ Prediction of Halo Coronal Mass Ejections Using SDO/HMI Vector Magnetic + Data Products and a Transformer Model + + +
+ We present a transformer model, named DeepHalo, to predict the occurrence of +halo coronal mass ejections (CMEs). Our model takes as input an active region +(AR) and a profile, where the profile contains a time series of data samples in +the AR that are collected 24 hours before the beginning of a day, and predicts +whether the AR would produce a halo CME during that day. Each data sample +contains physical parameters, or features, derived from photospheric vector +magnetic field data taken by the Helioseismic and Magnetic Imager (HMI) on +board the Solar Dynamics Observatory (SDO). We survey and match CME events in +the Space Weather Database Of Notification, Knowledge, Information (DONKI) and +Large Angle and Spectrometric Coronagraph (LASCO) CME Catalog, and compile a +list of CMEs including halo CMEs and non-halo CMEs associated with ARs in the +period between November 2010 and August 2023. We use the information gathered +above to build the labels (positive versus negative) of the data samples and +profiles at hand, where the labels are needed for machine learning. +Experimental results show that DeepHalo with a true skill statistics (TSS) +score of 0.907 outperforms a closely related long short-term memory network +with a TSS score of 0.821. To our knowledge, this is the first time that the +transformer model has been used for halo CME prediction. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Convergence Rates for Softmax Gating Mixture of Experts ICML 2024 + + +
+ Mixture of experts (MoE) has recently emerged as an effective framework to +advance the efficiency and scalability of machine learning models by softly +dividing complex tasks among multiple specialized sub-models termed experts. +Central to the success of MoE is an adaptive softmax gating mechanism which +takes responsibility for determining the relevance of each expert to a given +input and then dynamically assigning experts their respective weights. Despite +its widespread use in practice, a comprehensive study on the effects of the +softmax gating on the MoE has been lacking in the literature. To bridge this +gap in this paper, we perform a convergence analysis of parameter estimation +and expert estimation under the MoE equipped with the standard softmax gating +or its variants, including a dense-to-sparse gating and a hierarchical softmax +gating, respectively. Furthermore, our theories also provide useful insights +into the design of sample-efficient expert structures. In particular, we +demonstrate that it requires polynomially many data points to estimate experts +satisfying our proposed \emph{strong identifiability} condition, namely a +commonly used two-layer feed-forward network. In stark contrast, estimating +linear experts, which violate the strong identifiability condition, +necessitates exponentially many data points as a result of intrinsic parameter +interactions expressed in the language of partial differential equations. All +the theoretical results are substantiated with a rigorous guarantee. + +
+
+ comment: Section 2 of this work comes from our previous paper titled "On Least + Square Estimation in Softmax Gating Mixture of Experts" and published at the + ICML 2024 +
+
+
+
+
+ + ☆ NodeReg: Mitigating the Imbalance and Distribution Shift Effects in + Semi-Supervised Node Classification via Norm Consistency + + +
+ Aggregating information from neighboring nodes benefits graph neural networks +(GNNs) in semi-supervised node classification tasks. Nevertheless, this +mechanism also renders nodes susceptible to the influence of their neighbors. +For instance, this will occur when the neighboring nodes are imbalanced or the +neighboring nodes contain noise, which can even affect the GNN's ability to +generalize out of distribution. We find that ensuring the consistency of the +norm for node representations can significantly reduce the impact of these two +issues on GNNs. To this end, we propose a regularized optimization method +called NodeReg that enforces the consistency of node representation norms. This +method is simple but effective and satisfies Lipschitz continuity, thus +facilitating stable optimization and significantly improving semi-supervised +node classification performance under the above two scenarios. To illustrate, +in the imbalance scenario, when training a GCN with an imbalance ratio of 0.1, +NodeReg outperforms the most competitive baselines by 1.4%-25.9% in F1 score +across five public datasets. Similarly, in the distribution shift scenario, +NodeReg outperforms the most competitive baseline by 1.4%-3.1% in accuracy. + +
+
+
+
+
+ + ☆ An Analytical Theory of Power Law Spectral Bias in the Learning Dynamics + of Diffusion Models + + +
+ We developed an analytical framework for understanding how the learned +distribution evolves during diffusion model training. Leveraging the Gaussian +equivalence principle, we derived exact solutions for the gradient-flow +dynamics of weights in one- or two-layer linear denoiser settings with +arbitrary data. Remarkably, these solutions allowed us to derive the generated +distribution in closed form and its KL divergence through training. These +analytical results expose a pronounced power-law spectral bias, i.e., for +weights and distributions, the convergence time of a mode follows an inverse +power law of its variance. Empirical experiments on both Gaussian and image +datasets demonstrate that the power-law spectral bias remains robust even when +using deeper or convolutional architectures. Our results underscore the +importance of the data covariance in dictating the order and rate at which +diffusion models learn different modes of the data, providing potential +explanations for why earlier stopping could lead to incorrect details in image +generative models. + +
+
+ comment: 50 pages, 10 figures. Preprint +
+
+
+
+
+ + ☆ Directly Follows Graphs Go Predictive Process Monitoring With Graph + Neural Networks + + +
+ In the past years, predictive process monitoring (PPM) techniques based on +artificial neural networks have evolved as a method to monitor the future +behavior of business processes. Existing approaches mostly focus on +interpreting the processes as sequences, so-called traces, and feeding them to +neural architectures designed to operate on sequential data such as recurrent +neural networks (RNNs) or transformers. In this study, we investigate an +alternative way to perform PPM: by transforming each process in its +directly-follows-graph (DFG) representation we are able to apply graph neural +networks (GNNs) for the prediction tasks. By this, we aim to develop models +that are more suitable for complex processes that are long and contain an +abundance of loops. In particular, we present different ways to create DFG +representations depending on the particular GNN we use. The tested GNNs range +from classical node-based to novel edge-based architectures. Further, we +investigate the possibility of using multi-graphs. By these steps, we aim to +design graph representations that minimize the information loss when +transforming traces into graphs. + +
+
+ comment: 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Online Bidding under RoS Constraints without Knowing the Value + + +
+ We consider the problem of bidding in online advertising, where an advertiser +aims to maximize value while adhering to budget and Return-on-Spend (RoS) +constraints. Unlike prior work that assumes knowledge of the value generated by +winning each impression (e.g., conversions), we address the more realistic +setting where the advertiser must simultaneously learn the optimal bidding +strategy and the value of each impression opportunity. This introduces a +challenging exploration-exploitation dilemma: the advertiser must balance +exploring different bids to estimate impression values with exploiting current +knowledge to bid effectively. To address this, we propose a novel Upper +Confidence Bound (UCB)-style algorithm that carefully manages this trade-off. +Via a rigorous theoretical analysis, we prove that our algorithm achieves +$\widetilde{O}(\sqrt{T\log(|\mathcal{B}|T)})$ regret and constraint violation, +where $T$ is the number of bidding rounds and $\mathcal{B}$ is the domain of +possible bids. This establishes the first optimal regret and constraint +violation bounds for bidding in the online setting with unknown impression +values. Moreover, our algorithm is computationally efficient and simple to +implement. We validate our theoretical findings through experiments on +synthetic data, demonstrating that our algorithm exhibits strong empirical +performance compared to existing approaches. + +
+
+
+
+
+ + ♻ ☆ Personalize Your LLM: Fake it then Align it NAACL 2025 + + +
+ Personalizing large language models (LLMs) is essential for delivering +tailored interactions that improve user experience. Many existing +personalization methods require fine-tuning LLMs for each user, rendering them +prohibitively expensive for widespread adoption. Although retrieval-based +approaches offer a more compute-efficient alternative, they still depend on +large, high-quality datasets that are not consistently available for all users. +To address this challenge, we propose CHAMELEON, a scalable and efficient +personalization approach that uses (1) self-generated personal preference data +and (2) representation editing to enable quick and cost-effective +personalization. Our experiments on various tasks, including those from the +LaMP personalization benchmark, show that CHAMELEON efficiently adapts models +to personal preferences, improving instruction-tuned models and outperforming two +personalization baselines by an average of 40% across two model architectures. + +
+
+ comment: NAACL 2025 Findings +
+
+
+
+
+ + ♻ ☆ On Discriminative Probabilistic Modeling for Self-Supervised + Representation Learning ICLR 2025 + + +
+ We study the discriminative probabilistic modeling on a continuous domain for +the data prediction task of (multimodal) self-supervised representation +learning. To address the challenge of computing the integral in the partition +function for each anchor data, we leverage the multiple importance sampling +(MIS) technique for robust Monte Carlo integration, which can recover +InfoNCE-based contrastive loss as a special case. Within this probabilistic +modeling framework, we conduct generalization error analysis to reveal the +limitation of current InfoNCE-based contrastive loss for self-supervised +representation learning and derive insights for developing better approaches by +reducing the error of Monte Carlo integration. To this end, we propose a novel +non-parametric method for approximating the sum of conditional probability +densities required by MIS through convex optimization, yielding a new +contrastive objective for self-supervised representation learning. Moreover, we +design an efficient algorithm for solving the proposed objective. We +empirically compare our algorithm to representative baselines on the +contrastive image-language pretraining task. Experimental results on the CC3M +and CC12M datasets demonstrate the superior overall performance of our +algorithm. Our code is available at https://github.com/bokun-wang/NUCLR. + +
+
+ comment: To appear in ICLR 2025 +
+
+
+
+
+ + ♻ ☆ PARAMANU-GANITA: Can Small Math Language Models Rival with Large + Language Models on Mathematical Reasoning? + + +
+ In this paper, we study whether domain specific pretraining of small +generative language models (SLM) from scratch with domain specialized tokenizer +and Chain-of-Thought (CoT) instruction fine-tuning results in competitive +performance on mathematical reasoning compared to LLMs? Secondly, whether this +approach is environmentally sustainable, highly cost efficient? To address +these research questions, we present Paramanu-Ganita, a 208 million-parameter +novel decoder-only Auto Regressive SLM on mathematics. We performed pretraining +from scratch on 31.5 billion tokens for 170 A100 hours using a context size of +4096 on a mixed mathematical corpus consisting of web pages, source code, +textbooks, CoT templatised StackOverflow QA pairs, and mathematical lecture +notes in LaTeX curated by us. We also trained a math and code specialised BPE +tokenizer. We proposed and performed CoT instruction fine-tuning of +Paramanu-Ganita on the MetaMathQA dataset. Our model Paramanu-Ganita, despite +being 34 times smaller than the 7B LLMs, outperforms generalist LLMs by +approximately 30% points, and even math-specialised LLMs by 3-23% points in +GSM8K test accuracy metric. On MATH benchmark, Paramanu-Ganita outperformed the +various models by 6-8% points. On benchmarks like LogiQA, MMLU (high school, +college level), and competitive exams level, AGIEVAL (AQuA-RAT, SAT-Math), +Paramanu-Ganita outperformed others by 1-4%. Our model is available at +https://huggingface.co/gyanai/paramanu-ganita-208M-hf . + +
+
+
+
+
+ + ♻ ☆ Reusing Historical Trajectories in Natural Policy Gradient via + Importance Sampling: Convergence and Convergence Rate + + +
+ Reinforcement learning provides a mathematical framework for learning-based +control, whose success largely depends on the amount of data it can utilize. +The efficient utilization of historical trajectories obtained from previous +policies is essential for expediting policy optimization. Empirical evidence +has shown that policy gradient methods based on importance sampling work well. +However, existing literature often neglects the interdependence between +trajectories from different iterations, and the good empirical performance +lacks a rigorous theoretical justification. In this paper, we study a variant +of the natural policy gradient method with reusing historical trajectories via +importance sampling. We show that the bias of the proposed estimator of the +gradient is asymptotically negligible, the resultant algorithm is convergent, +and reusing past trajectories helps improve the convergence rate. We further +apply the proposed estimator to popular policy optimization algorithms such as +trust region policy optimization. Our theoretical results are verified on +classical benchmarks. + +
+
+
+
+
+ + ♻ ☆ Neural DNF-MT: A Neuro-symbolic Approach for Learning Interpretable and + Editable Policies AAMAS 2025 + + +
+ Although deep reinforcement learning has been shown to be effective, the +model's black-box nature presents barriers to direct policy interpretation. To +address this problem, we propose a neuro-symbolic approach called neural DNF-MT +for end-to-end policy learning. The differentiable nature of the neural DNF-MT +model enables the use of deep actor-critic algorithms for training. At the same +time, its architecture is designed so that trained models can be directly +translated into interpretable policies expressed as standard (bivalent or +probabilistic) logic programs. Moreover, additional layers can be included to +extract abstract features from complex observations, acting as a form of +predicate invention. The logic representations are highly interpretable, and we +show how the bivalent representations of deterministic policies can be edited +and incorporated back into a neural model, facilitating manual intervention and +adaptation of learned policies. We evaluate our approach on a range of tasks +requiring learning deterministic or stochastic behaviours from various forms of +observations. Our empirical results show that our neural DNF-MT model performs +at the level of competing black-box methods whilst providing interpretable +policies. + +
+
+ comment: AAMAS 2025 (with Appendix) +
+
+
+
+
+ + ♻ ☆ Beyond Matryoshka: Revisiting Sparse Coding for Adaptive Representation + + +
+ Many large-scale systems rely on high-quality deep representations +(embeddings) to facilitate tasks like retrieval, search, and generative +modeling. Matryoshka Representation Learning (MRL) recently emerged as a +solution for adaptive embedding lengths, but it requires full model retraining +and suffers from noticeable performance degradations at short lengths. In this +paper, we show that sparse coding offers a compelling alternative for achieving +adaptive representation with minimal overhead and higher fidelity. We propose +Contrastive Sparse Representation (CSR), a method that sparsifies pre-trained +embeddings into a high-dimensional but selectively activated feature space. By +leveraging lightweight autoencoding and task-aware contrastive objectives, CSR +preserves semantic quality while allowing flexible, cost-effective inference at +different sparsity levels. Extensive experiments on image, text, and multimodal +benchmarks demonstrate that CSR consistently outperforms MRL in terms of both +accuracy and retrieval speed -- often by large margins -- while also cutting training +time to a fraction of that required by MRL. Our results establish sparse coding +as a powerful paradigm for adaptive representation learning in real-world +applications where efficiency and fidelity are both paramount. Code is +available at https://github.com/neilwen987/CSR_Adaptive_Rep + +
+
+ comment: A novel sparse coding framework designed for learning adaptive + representation +
+
+
+
+
+ + ♻ ☆ CTC-DRO: Robust Optimization for Reducing Language Disparities in Speech + Recognition + + +
+ Modern deep learning models often achieve high overall performance, but +consistently fail on specific subgroups. Group distributionally robust +optimization (group DRO) addresses this problem by minimizing the worst-group +loss, but it fails when group losses misrepresent performance differences +between groups. This is common in domains like speech, where the widely used +connectionist temporal classification (CTC) loss scales with input length and +varies with linguistic and acoustic properties, leading to spurious differences +between group losses. We present CTC-DRO, which addresses the shortcomings of +the group DRO objective by smoothing the group weight update to prevent +overemphasis on consistently high-loss groups, while using input length-matched +batching to mitigate CTC's scaling issues. We evaluate CTC-DRO on the task of +multilingual automatic speech recognition (ASR) across five language sets from +the ML-SUPERB 2.0 benchmark. CTC-DRO consistently outperforms group DRO and +CTC-based baseline models, reducing the worst-language error by up to 47.1% and +the average error by up to 32.9%. CTC-DRO can be applied to ASR with minimal +computational costs, and offers the potential for reducing group disparities in +other domains with similar challenges. + +
+
+
+
+
+ + ♻ ☆ Bonsai: Gradient-free Graph Distillation for Node Classification + + +
+ Graph distillation has emerged as a promising avenue to enable scalable +training of GNNs by compressing the training dataset while preserving essential +graph characteristics. Our study uncovers significant shortcomings in current +graph distillation techniques. First, the majority of the algorithms +paradoxically require training on the full dataset to perform distillation. +Second, due to their gradient-emulating approach, these methods require fresh +distillation for any change in hyperparameters or GNN architecture, limiting +their flexibility and reusability. Finally, they fail to achieve substantial +size reduction due to synthesizing fully-connected, edge-weighted graphs. To +address these challenges, we present Bonsai, a novel graph distillation method +empowered by the observation that \textit{computation trees} form the +fundamental processing units of message-passing GNNs. Bonsai distills datasets +by encoding a careful selection of \textit{exemplar} trees that maximize the +representation of all computation trees in the training set. This unique +approach imparts Bonsai as the first linear-time, model-agnostic graph +distillation algorithm for node classification that outperforms existing +baselines across $6$ real-world datasets on accuracy, while being $22$ times +faster on average. Bonsai is grounded in rigorous mathematical guarantees on +the adopted approximation strategies making it robust to GNN architectures, +datasets, and parameters. + +
+
+
+
+
+ + ♻ ☆ Statistical Advantages of Perturbing Cosine Router in Mixture of Experts ICLR 2025 + + +
+ The cosine router in Mixture of Experts (MoE) has recently emerged as an +attractive alternative to the conventional linear router. Indeed, the cosine +router demonstrates favorable performance in image and language tasks and +exhibits better ability to mitigate the representation collapse issue, which +often leads to parameter redundancy and limited representation potentials. +Despite its empirical success, a comprehensive analysis of the cosine router in +MoE has been lacking. Considering the least square estimation of the cosine +routing MoE, we demonstrate that due to the intrinsic interaction of the model +parameters in the cosine router via some partial differential equations, +regardless of the structures of the experts, the estimation rates of experts +and model parameters can be as slow as $\mathcal{O}(1/\log^{\tau}(n))$ where +$\tau > 0$ is some constant and $n$ is the sample size. Surprisingly, these +pessimistic non-polynomial convergence rates can be circumvented by the widely +used technique in practice to stabilize the cosine router -- simply adding +noises to the $\ell^2$-norms in the cosine router, which we refer to as +\textit{perturbed cosine router}. Under the strongly identifiable settings of +the expert functions, we prove that the estimation rates for both the experts +and model parameters under the perturbed cosine routing MoE are significantly +improved to polynomial rates. Finally, we conduct extensive simulation studies +in both synthetic and real data settings to empirically validate our +theoretical results. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ What to align in multimodal contrastive learning? ICLR 2025 + + +
+ Humans perceive the world through multisensory integration, blending the +information of different modalities to adapt their behavior. Contrastive +learning offers an appealing solution for multimodal self-supervised learning. +Indeed, by considering each modality as a different view of the same entity, it +learns to align features of different modalities in a shared representation +space. However, this approach is intrinsically limited as it only learns shared +or redundant information between modalities, while multimodal interactions can +arise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal +learning strategy that enables the communication between modalities in a single +multimodal space. Instead of imposing cross- or intra- modality constraints, we +propose to align multimodal representations by maximizing the mutual +information between augmented versions of these multimodal features. Our +theoretical analysis shows that shared, synergistic and unique terms of +information naturally emerge from this formulation, allowing us to estimate +multimodal interactions beyond redundancy. We test CoMM both in a controlled +and in a series of real-world settings: in the former, we demonstrate that CoMM +effectively captures redundant, unique and synergistic information between +modalities. In the latter, CoMM learns complex multimodal interactions and +achieves state-of-the-art results on the seven multimodal benchmarks. Code is +available at https://github.com/Duplums/CoMM + +
+
+ comment: ICLR 2025, 25 pages +
+
+
+
+
+ + ♻ ☆ CycleResearcher: Improving Automated Research via Automated Review ICLR 2025 + + +
+ The automation of scientific discovery has been a long-standing goal within +the research community, driven by the potential to accelerate knowledge +creation. While significant progress has been made using commercial large +language models (LLMs) as research assistants or idea generators, the +possibility of automating the entire research process with open-source LLMs +remains largely unexplored. This paper explores the feasibility of using +open-source post-trained LLMs as autonomous agents capable of performing the +full cycle of automated research and review, from literature review and +manuscript preparation to peer review and paper refinement. Our iterative +preference training framework consists of CycleResearcher, which conducts +research tasks, and CycleReviewer, which simulates the peer review process, +providing iterative feedback via reinforcement learning. To train these models, +we develop two new datasets, Review-5k and Research-14k, reflecting real-world +machine learning research and peer review dynamics. Our results demonstrate +that CycleReviewer achieves promising performance with a 26.89% reduction in +mean absolute error (MAE) compared to individual human reviewers in predicting +paper scores, indicating the potential of LLMs to effectively assist +expert-level research evaluation. In research, the papers generated by the +CycleResearcher model achieved a score of 5.36 in simulated peer reviews, +showing some competitiveness in terms of simulated review scores compared to +the preprint level of 5.24 from human experts, while still having room for +improvement compared to the accepted paper level of 5.69. This work represents +a significant step toward fully automated scientific inquiry, providing ethical +safeguards and exploring AI-driven research capabilities. The code, dataset and +model weights are released at https://wengsyx.github.io/Researcher/ + +
+
+ comment: Accept in ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Unveiling Simplicities of Attention: Adaptive Long-Context Head + Identification + + +
+ The ability to process long contexts is crucial for many natural language +processing tasks, yet it remains a significant challenge. While substantial +progress has been made in enhancing the efficiency of attention mechanisms, +there is still a gap in understanding how attention heads function in +long-context settings. In this paper, we observe that while certain heads +consistently attend to local information only, others swing between attending +to local and long-context information depending on the query. This raises the +question: can we identify which heads require long-context information to +predict the next token accurately? We demonstrate that it's possible to predict +which heads are crucial for long-context processing using only local keys. The +core idea here is to exploit a simple model for the long-context scores via +second moment approximations. These findings unveil simple properties of +attention in the context of long sequences, and open the door to potentially +significant gains in efficiency. + +
+
+
+
+
+ + ♻ ☆ Generative Adversarial Networks for High-Dimensional Item Factor + Analysis: A Deep Adversarial Learning Algorithm + + +
+ Advances in deep learning and representation learning have transformed item +factor analysis (IFA) in the item response theory (IRT) literature by enabling +more efficient and accurate parameter estimation. Variational Autoencoders +(VAEs) have been one of the most impactful techniques in modeling +high-dimensional latent variables in this context. However, the limited +expressiveness of the inference model based on traditional VAEs can still +hinder the estimation performance. We introduce Adversarial Variational Bayes +(AVB) algorithms as an improvement to VAEs for IFA with improved flexibility +and accuracy. By bridging the strengths of VAEs and Generative Adversarial +Networks (GANs), AVB incorporates an auxiliary discriminator network to reframe +the estimation process as a two-player adversarial game and removes the +restrictive assumption of standard normal distributions in the inference model. +Theoretically, AVB can achieve similar or higher likelihood compared to VAEs. A +further enhanced algorithm, Importance-weighted Adversarial Variational Bayes +(IWAVB) is proposed and compared with Importance-weighted Autoencoders (IWAE). +In an exploratory analysis of empirical data, IWAVB demonstrated superior +expressiveness by achieving a higher likelihood compared to IWAE. In +confirmatory analysis with simulated data, IWAVB achieved similar mean-square +error results to IWAE while consistently achieving higher likelihoods. When +latent variables followed a multimodal distribution, IWAVB outperformed IWAE. +With its innovative use of GANs, IWAVB is shown to have the potential to extend +IFA to handle large-scale data, facilitating the potential integration of +psychometrics and multimodal data analysis. + +
+
+
+
+
+ + ♻ ☆ MMBind: Unleashing the Potential of Distributed and Heterogeneous Data + for Multimodal Learning in IoT + + +
+ Multimodal sensing systems are increasingly prevalent in various real-world +applications. Most existing multimodal learning approaches heavily rely on +training with a large amount of synchronized, complete multimodal data. +However, such a setting is impractical in real-world IoT sensing applications +where data is typically collected by distributed nodes with heterogeneous data +modalities, and is also rarely labeled. In this paper, we propose MMBind, a new +data binding approach for multimodal learning on distributed and heterogeneous +IoT data. The key idea of MMBind is to construct a pseudo-paired multimodal +dataset for model training by binding data from disparate sources and +incomplete modalities through a sufficiently descriptive shared modality. We +also propose a weighted contrastive learning approach to handle domain shifts +among disparate data, coupled with an adaptive multimodal learning architecture +capable of training models with heterogeneous modality combinations. +Evaluations on ten real-world multimodal datasets highlight that MMBind +outperforms state-of-the-art baselines under varying degrees of data +incompleteness and domain shift, and holds promise for advancing multimodal +foundation model training in IoT applications. The source code is +available at https://github.com/nesl/multimodal-bind. + +
+
+
+
+
+ + ♻ ☆ Exploring Code Language Models for Automated HLS-based Hardware + Generation: Benchmark, Infrastructure and Analysis SP + + +
+ Recent advances in code generation have illuminated the potential of +employing large language models (LLMs) for general-purpose programming +languages such as Python and C++, opening new opportunities for automating +software development and enhancing programmer productivity. The potential of +LLMs in software programming has sparked significant interest in exploring +automated hardware generation and automation. Although preliminary endeavors +have been made to adopt LLMs in generating hardware description languages +(HDLs), several challenges persist in this direction. First, the volume of +available HDL training data is substantially smaller compared to that for +software programming languages. Second, the pre-trained LLMs, mainly tailored +for software code, tend to produce HDL designs that are more error-prone. +Third, the generation of HDL requires a significantly higher number of tokens +compared to software programming, leading to inefficiencies in cost and energy +consumption. To tackle these challenges, this paper explores leveraging LLMs to +generate High-Level Synthesis (HLS)-based hardware design. Although code +generation for domain-specific programming languages is not new in the +literature, we aim to provide experimental results, insights, benchmarks, and +evaluation infrastructure to investigate the suitability of HLS over low-level +HDLs for LLM-assisted hardware design generation. To achieve this, we first +finetune pre-trained models for HLS-based hardware generation, using a +collected dataset with text prompts and corresponding reference HLS designs. An +LLM-assisted framework is then proposed to automate end-to-end hardware code +generation, which also investigates the impact of chain-of-thought and feedback +loops prompting techniques on HLS-design generation. Limited by the timeframe +of this research, we plan to evaluate more advanced reasoning models in the +future. + +
+
+ comment: Paper accepted by ASP-DAC'25 +
+
+
+
+
+ + ♻ ☆ One-Shot Imitation under Mismatched Execution + + +
+ Human demonstrations as prompts are a powerful way to program robots to do +long-horizon manipulation tasks. However, translating these demonstrations into +robot-executable actions presents significant challenges due to execution +mismatches in movement styles and physical capabilities. Existing methods +either depend on human-robot paired data, which is infeasible to scale, or rely +heavily on frame-level visual similarities that often break down in practice. +To address these challenges, we propose RHyME, a novel framework that +automatically aligns human and robot task executions using optimal transport +costs. Given long-horizon robot demonstrations, RHyME synthesizes semantically +equivalent human videos by retrieving and composing short-horizon human clips. +This approach facilitates effective policy training without the need for paired +data. RHyME successfully imitates a range of cross-embodiment demonstrators, +both in simulation and with a real human hand, achieving over 50% increase in +task success compared to previous methods. We release our code and datasets at +https://portal-cornell.github.io/rhyme/. + +
+
+
+
+
+ + ♻ ☆ MDP Geometry, Normalization and Reward Balancing Solvers AISTATS 2025 + + +
+ We present a new geometric interpretation of Markov Decision Processes (MDPs) +with a natural normalization procedure that allows us to adjust the value +function at each state without altering the advantage of any action with +respect to any policy. This advantage-preserving transformation of the MDP +motivates a class of algorithms which we call Reward Balancing, which solve +MDPs by iterating through these transformations, until an approximately optimal +policy can be trivially found. We provide a convergence analysis of several +algorithms in this class, in particular showing that for MDPs for unknown +transition probabilities we can improve upon state-of-the-art sample complexity +results. + +
+
+ comment: AISTATS 2025 camera-ready version +
+
+
+
+
+ + ♻ ☆ Capability-Aware Shared Hypernetworks for Flexible Heterogeneous + Multi-Robot Coordination + + +
+ Recent advances have enabled heterogeneous multi-robot teams to learn complex +and effective coordination. However, existing architectural designs that +support heterogeneous teams tend to force a trade-off between expressivity and +efficiency. Some attempt to encode diverse behaviors within a single shared +architecture by appending the input with an ID unique to each robot or robot +type. These designs improve sample and parameter efficiency but tend to limit +behavioral diversity. Others use a separate policy for each robot, enabling +greater diversity at the cost of efficiency and generalization. We view these +two designs as ends of a spectrum and explore a middle-ground approach that +enables efficient learning of diverse behaviors. Inspired by work in transfer +learning and meta RL, and building upon prior work in trait-based task +allocation, we propose Capability-Aware Shared Hypernetworks (CASH), a +general-purpose soft weight sharing architecture that uses hypernetworks to +enable a single architecture to dynamically adapt to each robot and the current +context. Intuitively, CASH encodes shared decision making strategies that can +be adapted to each robot based on local observations and the robots' individual +and collective capabilities (e.g., speed and payload). CASH explicitly captures +the impact of capabilities on collective behavior, enabling zero-shot +generalization to unseen robots or team compositions. We conducted experiments +across four heterogeneous coordination tasks and three learning paradigms +(imitation learning, value-based, and policy-gradient RL) using SOTA +multi-robot simulation (JaxMARL) and hardware (Robotarium) platforms. Across +all conditions, CASH generates appropriately diverse behaviors and outperforms +baseline architectures in task performance and sample efficiency during +training and zero-shot generalization while utilizing 60%-80% fewer learnable +parameters. + +
+
+ comment: 16 pages, 8 figures, equal authorship between Kevin Fu and Shalin + Jain +
+
+
+
+
+ + ♻ ☆ Dashing for the Golden Snitch: Multi-Drone Time-Optimal Motion Planning + with Multi-Agent Reinforcement Learning + + +
+ Recent innovations in autonomous drones have facilitated time-optimal flight +in single-drone configurations, and enhanced maneuverability in multi-drone +systems by applying optimal control and learning-based methods. However, few +studies have achieved time-optimal motion planning for multi-drone systems, +particularly during highly agile maneuvers or in dynamic scenarios. This paper +presents a decentralized policy network using multi-agent reinforcement +learning for time-optimal multi-drone flight. To strike a balance between +flight efficiency and collision avoidance, we introduce a soft collision-free +mechanism inspired by optimization-based methods. By customizing PPO in a +centralized training, decentralized execution (CTDE) fashion, we unlock higher +efficiency and stability in training while ensuring lightweight implementation. +Extensive simulations show that, despite slight performance trade-offs compared +to single-drone systems, our multi-drone approach maintains near-time-optimal +performance with a low collision rate. Real-world experiments validate our +method, with two quadrotors using the same network as in simulation achieving a +maximum speed of 13.65 m/s and a maximum body rate of 13.4 rad/s in a 5.5 m * +5.5 m * 2.0 m space across various tracks, relying entirely on onboard +computation. + +
+
+ comment: v2: 7 pages, 6 figures; terminology corrected, algorithmic and + equation descriptions revised, references added +
+
+
+
+
+ + ♻ ☆ Beyond Canonicalization: How Tensorial Messages Improve Equivariant + Message Passing ICLR 2025 + + +
+ In numerous applications of geometric deep learning, the studied systems +exhibit spatial symmetries and it is desirable to enforce these. For the +symmetry of global rotations and reflections, this means that the model should +be equivariant with respect to the transformations that form the group of +$\mathrm O(d)$. While many approaches for equivariant message passing require +specialized architectures, including non-standard normalization layers or +non-linearities, we here present a framework based on local reference frames +("local canonicalization") which can be integrated with any architecture +without restrictions. We enhance equivariant message passing based on local +canonicalization by introducing tensorial messages to communicate geometric +information consistently between different local coordinate frames. Our +framework applies to message passing on geometric data in Euclidean spaces of +arbitrary dimension. We explicitly show how our approach can be adapted to make +a popular existing point cloud architecture equivariant. We demonstrate the +superiority of tensorial messages and achieve state-of-the-art results on +normal vector regression and competitive results on other standard 3D point +cloud tasks. + +
+
+ comment: To be published in proceedings of ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Incentivizing Truthful Collaboration in Heterogeneous Federated Learning + + +
+ Federated learning (FL) is a distributed collaborative learning method, where +multiple clients learn together by sharing gradient updates instead of raw +data. However, it is well-known that FL is vulnerable to manipulated updates +from clients. In this work we study the impact of data heterogeneity on +clients' incentives to manipulate their updates. First, we present +heterogeneous collaborative learning scenarios where a client can modify their +updates to be better off, and show that these manipulations can lead to +diminishing model performance. To prevent such modifications, we formulate a +game in which clients may misreport their gradient updates in order to "steer" +the server model to their advantage. We develop a payment rule that provably +disincentivizes sending modified updates under the FedSGD protocol. We derive +explicit bounds on the clients' payments and the convergence rate of the global +model, which allows us to study the trade-off between heterogeneity, payments +and convergence. Finally, we provide an experimental evaluation of the +effectiveness of our payment rule in the FedSGD, median-based aggregation +FedSGD and FedAvg protocols on three tasks in computer vision and natural +language processing. In all cases we find that our scheme successfully +disincentivizes modifications. + +
+
+ comment: 29 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ On the Utility of Equivariance and Symmetry Breaking in Deep Learning + Architectures on Point Clouds + + +
+ This paper explores the key factors that influence the performance of models +working with point clouds, across different tasks of varying geometric +complexity. In this work, we explore the trade-offs between flexibility and +weight-sharing introduced by equivariant layers, assessing when equivariance +boosts or detracts from performance. It is often argued that providing more +information as input improves a model's performance. However, if this +additional information breaks certain properties, such as $\mathrm{SE}(3)$ +equivariance, does it remain beneficial? We identify the key aspects of +equivariant and non-equivariant architectures that drive success in different +tasks by benchmarking them on segmentation, regression, and generation tasks +across multiple datasets with increasing complexity. We observe a positive +impact of equivariance, which becomes more pronounced with increasing task +complexity, even when strict equivariance is not required. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Efficient Neural SDE Training using Wiener-Space Cubature + + +
+ A neural stochastic differential equation (SDE) is an SDE with drift and +diffusion terms parametrized by neural networks. The training procedure for +neural SDEs consists of optimizing the SDE vector field (neural network) +parameters to minimize the expected value of an objective functional on +infinite-dimensional path-space. Existing training techniques focus on methods +to efficiently compute path-wise gradients of the objective functional with +respect to these parameters, then pair this with Monte-Carlo simulation to +estimate the expectation, and stochastic gradient descent to optimize. In this +work we introduce a novel training technique which bypasses and improves upon +Monte-Carlo simulation; we extend results in the theory of Wiener-space +cubature to approximate the expected objective functional by a weighted sum of +deterministic ODE solutions. This allows us to compute gradients by efficient +ODE adjoint methods. Furthermore, we exploit a high-order recombination scheme +to drastically reduce the number of ODE solutions necessary to achieve a +reasonable approximation. We show that this Wiener-space cubature approach can +surpass the O(1/sqrt(n)) rate of Monte-Carlo simulation, or the O(log(n)/n) +rate of quasi-Monte-Carlo, to achieve a O(1/n) rate under reasonable +assumptions. + +
+
+
+
+
+ + ♻ ☆ LLMs can be Dangerous Reasoners: Analyzing-based Jailbreak Attack on + Large Language Models + + +
+ The rapid development of Large Language Models (LLMs) has brought significant +advancements across various tasks. However, despite these achievements, LLMs +still exhibit inherent safety vulnerabilities, especially when confronted with +jailbreak attacks. Existing jailbreak methods suffer from two main limitations: +reliance on complicated prompt engineering and iterative optimization, which +lead to low attack success rate (ASR) and attack efficiency (AE). In this work, +we propose an efficient jailbreak attack method, Analyzing-based Jailbreak +(ABJ), which leverages the advanced reasoning capability of LLMs to +autonomously generate harmful content, revealing their underlying safety +vulnerabilities during complex reasoning process. We conduct comprehensive +experiments on ABJ across various open-source and closed-source LLMs. In +particular, ABJ achieves high ASR (82.1% on GPT-4o-2024-11-20) with exceptional +AE among all target LLMs, showcasing its remarkable attack effectiveness, +transferability, and efficiency. Our findings underscore the urgent need to +prioritize and improve the safety of LLMs to mitigate the risks of misuse. + +
+
+
+
+
+ + ♻ ☆ Online Scheduling for LLM Inference with KV Cache Constraints + + +
+ Large Language Model (LLM) inference, where a trained model generates text +one word at a time in response to user prompts, is a computationally intensive +process requiring efficient scheduling to optimize latency and resource +utilization. A key challenge in LLM inference is the management of the +Key-Value (KV) cache, which reduces redundant computations but introduces +memory constraints. In this work, we model LLM inference with KV cache +constraints theoretically and propose novel batching and scheduling algorithms +that minimize inference latency while effectively managing the KV cache's +memory. + We analyze both semi-online and fully online scheduling models, and our +results are threefold. First, we provide a polynomial-time algorithm that +achieves exact optimality in terms of average latency in the semi-online prompt +arrival model. Second, in the fully online case with a stochastic prompt +arrival, we introduce an efficient online scheduling algorithm with constant +regret. Third, we prove that no algorithm (deterministic or randomized) can +achieve a constant competitive ratio in fully online adversarial settings. Our +empirical evaluations on a public LLM inference dataset, using the Llama-70B +model on A100 GPUs, show that our approach significantly outperforms benchmark +algorithms used currently in practice, achieving lower latency while reducing +energy consumption. Overall, our results offer a path toward more sustainable +and cost-effective LLM deployment. + +
+
+ comment: Will add a lemma in the proof of Theorem 5.3 to make the statement + and proof more rigorous +
+
+
+
+
+ + ♻ ☆ GSplatLoc: Grounding Keypoint Descriptors into 3D Gaussian Splatting for + Improved Visual Localization + + +
+ Although various visual localization approaches exist, such as scene +coordinate regression and camera pose regression, these methods often struggle +with optimization complexity or limited accuracy. To address these challenges, +we explore the use of novel view synthesis techniques, particularly 3D Gaussian +Splatting (3DGS), which enables the compact encoding of both 3D geometry and +scene appearance. We propose a two-stage procedure that integrates dense and +robust keypoint descriptors from the lightweight XFeat feature extractor into +3DGS, enhancing performance in both indoor and outdoor environments. The coarse +pose estimates are directly obtained via 2D-3D correspondences between the 3DGS +representation and query image descriptors. In the second stage, the initial +pose estimate is refined by minimizing the rendering-based photometric warp +loss. Benchmarking on widely used indoor and outdoor datasets demonstrates +improvements over recent neural rendering-based localization methods, such as +NeRFMatch and PNeRFLoc. + +
+
+ comment: Project website at https://gsplatloc.github.io/ +
+
+
+
+
+ + ♻ ☆ PCM Selector: Penalized Covariate-Mediator Selection Operator for + Evaluating Linear Causal Effects AAAI 2025 + + +
+ For a data-generating process for random variables that can be described with +a linear structural equation model, we consider a situation in which (i) a set +of covariates satisfying the back-door criterion cannot be observed or (ii) +such a set can be observed, but standard statistical estimation methods cannot +be applied to estimate causal effects because of +multicollinearity/high-dimensional data problems. We propose a novel two-stage +penalized regression approach, the penalized covariate-mediator selection +operator (PCM Selector), to estimate the causal effects in such scenarios. +Unlike existing penalized regression analyses, when a set of intermediate +variables is available, PCM Selector provides a consistent or less biased +estimator of the causal effect. In addition, PCM Selector provides a variable +selection procedure for intermediate variables to obtain better estimation +accuracy of the causal effects than does the back-door criterion. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ How simple can you go? An off-the-shelf transformer approach to + molecular dynamics + + +
+ Most current neural networks for molecular dynamics (MD) include physical +inductive biases, resulting in specialized and complex architectures. This is +in contrast to most other machine learning domains, where specialist approaches +are increasingly replaced by general-purpose architectures trained on vast +datasets. In line with this trend, several recent studies have questioned the +necessity of architectural features commonly found in MD models, such as +built-in rotational equivariance or energy conservation. In this work, we +contribute to the ongoing discussion by evaluating the performance of an MD +model with as few specialized architectural features as possible. We present a +recipe for MD using an Edge Transformer, an "off-the-shelf" transformer +architecture that has been minimally modified for the MD domain, termed MD-ET. +Our model implements neither built-in equivariance nor energy conservation. We +use a simple supervised pre-training scheme on $\sim$30 million molecular +structures from the QCML database. Using this "off-the-shelf" approach, we +show state-of-the-art results on several benchmarks after fine-tuning for a +small number of steps. Additionally, we examine the effects of being only +approximately equivariant and energy conserving for MD simulations, proposing a +novel method for distinguishing the errors resulting from non-equivariance from +other sources of inaccuracies like numerical rounding errors. While our model +exhibits runaway energy increases on larger structures, we show approximately +energy-conserving NVE simulations for a range of small structures. + +
+
+ comment: 21 pages, code at https://github.com/mx-e/simple-md +
+
+
+
+
+ + ♻ ☆ From Sparse Dependence to Sparse Attention: Unveiling How + Chain-of-Thought Enhances Transformer Sample Efficiency + + +
+ Chain-of-thought (CoT) significantly enhances the reasoning performance of +large language models (LLM). While current theoretical studies often attribute +this improvement to increased expressiveness and computational capacity, we +argue that expressiveness is not the primary limitation in the LLM regime, as +current large models will fail on simple tasks. Using a parity-learning setup, +we demonstrate that CoT can substantially improve sample efficiency even when +the representation power is sufficient. Specifically, with CoT, a transformer +can learn the function within polynomial samples, whereas without CoT, the +required sample size is exponential. Additionally, we show that CoT simplifies +the learning process by introducing sparse sequential dependencies among input +tokens, and leads to a sparse and interpretable attention. We validate our +theoretical analysis with both synthetic and real-world experiments, confirming +that sparsity in attention layers is a key factor of the improvement induced by +CoT. + +
+
+ comment: 43 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Handling Spatial-Temporal Data Heterogeneity for Federated Continual + Learning via Tail Anchor CVPR 2025 + + +
+ Federated continual learning (FCL) allows each client to continually update +its knowledge from task streams, enhancing the applicability of federated +learning in real-world scenarios. However, FCL needs to address not only +spatial data heterogeneity between clients but also temporal data heterogeneity +between tasks. In this paper, empirical experiments demonstrate that such +input-level heterogeneity significantly affects the model's internal parameters +and outputs, leading to severe spatial-temporal catastrophic forgetting of +local and previous knowledge. To this end, we propose Federated Tail Anchor +(FedTA) to mix trainable Tail Anchor with the frozen output features to adjust +their position in the feature space, thereby overcoming parameter-forgetting +and output-forgetting. Three novel components are also included: Input +Enhancement for improving the performance of pre-trained models on downstream +tasks; Selective Input Knowledge Fusion for fusion of heterogeneous local +knowledge on the server; and Best Global Prototype Selection for finding the +best anchor point for each class in the feature space. Extensive experiments +demonstrate that FedTA not only outperforms existing FCL methods but also +effectively preserves the relative positions of features. + +
+
+ comment: This paper is accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Promote, Suppress, Iterate: How Language Models Answer One-to-Many + Factual Queries + + +
+ To answer one-to-many factual queries (e.g., listing cities of a country), a +language model (LM) must simultaneously recall knowledge and avoid repeating +previous answers. How are these two subtasks implemented and integrated +internally? Across multiple datasets and models, we identify a +promote-then-suppress mechanism: the model first recalls all answers, and then +suppresses previously generated ones. Specifically, LMs use both the subject +and previous answer tokens to perform knowledge recall, with attention +propagating subject information and MLPs promoting the answers. Then, attention +attends to and suppresses previous answer tokens, while MLPs amplify the +suppression signal. Our mechanism is corroborated by extensive experimental +evidence: in addition to using early decoding and causal tracing, we analyze +how components use different tokens by introducing both Token Lens, which +decodes aggregated attention updates from specified tokens, and a knockout +method that analyzes changes in MLP outputs after removing attention to +specified tokens. Overall, we provide new insights into how LMs' internal +components interact with different input tokens to support complex factual +recall. Code is available at +https://github.com/Lorenayannnnn/how-lms-answer-one-to-many-factual-queries. + +
+
+
+
+
+ + ♻ ☆ Graph-Aware Isomorphic Attention for Adaptive Dynamics in Transformers + + +
+ We present an approach to modifying Transformer architectures by integrating +graph-aware relational reasoning into the attention mechanism, merging concepts +from graph neural networks and language modeling. Building on the inherent +connection between attention and graph theory, we reformulate the Transformer's +attention mechanism as a graph operation and propose Graph-Aware Isomorphic +Attention. This method leverages advanced graph modeling strategies, including +Graph Isomorphism Networks (GIN) and Principal Neighborhood Aggregation (PNA), +to enrich the representation of relational structures. Our approach captures +complex dependencies and generalizes across tasks, as evidenced by a reduced +generalization gap and improved learning performance. Additionally, we expand +the concept of graph-aware attention to introduce Sparse GIN-Attention, a +fine-tuning approach that employs sparse GINs. By interpreting attention +matrices as sparse adjacency graphs, this technique enhances the adaptability +of pre-trained foundational models with minimal computational overhead, +endowing them with graph-aware capabilities. Sparse GIN-Attention fine-tuning +achieves improved training dynamics and better generalization compared to +alternative methods like low-rank adaption (LoRA). We discuss latent graph-like +structures within traditional attention mechanisms, offering a new lens through +which Transformers can be understood. By evolving Transformers as hierarchical +GIN models for relational reasoning, this perspective suggests profound +implications for foundational model development, enabling the design of +architectures that dynamically adapt to both local and global dependencies. +Applications in bioinformatics, materials science, language modeling, and +beyond could benefit from this synthesis of relational and sequential data +modeling, setting the stage for interpretable and generalizable modeling +strategies. + +
+
+
+
+
+ + ♻ ☆ Zero-Knowledge Proof-based Verifiable Decentralized Machine Learning in + Communication Network: A Comprehensive Survey + + +
+ Over recent decades, machine learning has significantly advanced network +communication, enabling improved decision-making, user behavior analysis, and +fault detection. Decentralized approaches, where participants exchange +computation results instead of raw private data, mitigate these risks but +introduce challenges related to trust and verifiability. A critical issue +arises: How can one ensure the integrity and validity of computation results +shared by other participants? Existing survey articles predominantly address +security and privacy concerns in decentralized machine learning, whereas this +survey uniquely highlights the emerging issue of verifiability. Recognizing the +critical role of zero-knowledge proofs in ensuring verifiability, we present a +comprehensive review of Zero-Knowledge Proof-based Verifiable Machine Learning +(ZKP-VML). To clarify the research problem, we present a definition of ZKP-VML +consisting of four algorithms, along with several corresponding key security +properties. Besides, we provide an overview of the current research landscape +by systematically organizing the research timeline and categorizing existing +schemes based on their security properties. Furthermore, through an in-depth +analysis of each existing scheme, we summarize their technical contributions +and optimization strategies, aiming to uncover common design principles +underlying ZKP-VML schemes. Building on the reviews and analysis presented, we +identify current research challenges and suggest future research directions. To +the best of our knowledge, this is the most comprehensive survey to date on +verifiable decentralized machine learning and ZKP-VML. + +
+
+
+
+
+ + ♻ ☆ Channel-Attentive Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) set the state-of-the-art in representation +learning for graph-structured data. They are used in many domains, from online +social networks to complex molecules. Most GNNs leverage the message-passing +paradigm and achieve strong performances on various tasks. However, the +message-passing mechanism used in most models suffers from over-smoothing as a +GNN's depth increases. The over-smoothing degrades GNN's performance due to the +increased similarity between the representations of unrelated nodes. This study +proposes an adaptive channel-wise message-passing approach to alleviate the +over-smoothing. The proposed model, Channel-Attentive GNN, learns how to attend +to neighboring nodes and their feature channels. Thus, much diverse information +can be transferred between nodes during message-passing. Experiments with +widely used benchmark datasets show that the proposed model is more resistant +to over-smoothing than baselines and achieves state-of-the-art performances for +various graphs with strong heterophily. Our code is at +https://github.com/ALLab-Boun/CHAT-GNN. + +
+
+ comment: Published as a conference paper at IEEE International Conference on + Data Mining 2024 +
+
+
+
+
+ + ♻ ☆ LADDER: Self-Improving LLMs Through Recursive Problem Decomposition + + +
+ We introduce LADDER (Learning through Autonomous Difficulty-Driven Example +Recursion), a framework which enables Large Language Models to autonomously +improve their problem-solving capabilities through self-guided learning by +recursively generating and solving progressively simpler variants of complex +problems. Unlike prior approaches that require curated datasets or human +feedback, LADDER leverages a model's own capabilities to generate easier +question variants. We demonstrate LADDER's effectiveness in the subject of +mathematical integration, improving Llama 3.2 3B's accuracy from 1% to 82% on +undergraduate-level problems and enabling Qwen2.5 7B Deepseek-R1 Distilled to +achieve 73% on the MIT Integration Bee qualifying examination. We also +introduce TTRL (Test-Time Reinforcement Learning), where we perform +reinforcement learning on variants of test problems at inference time. TTRL +enables Qwen2.5 7B Deepseek-R1 Distilled to achieve a state-of-the-art score of +90% on the MIT Integration Bee qualifying examination, surpassing OpenAI o1's +performance. These results show how self-directed strategic learning can +achieve significant capability improvements without relying on architectural +scaling or human supervision. + +
+
+
+
+
+ + ♻ ☆ ChaI-TeA: A Benchmark for Evaluating Autocompletion of Interactions with + LLM-based Chatbots + + +
+ The rise of LLMs has deflected a growing portion of human-computer +interactions towards LLM-based chatbots. The remarkable abilities of these +models allow users to interact using long, diverse natural language text +covering a wide range of topics and styles. Phrasing these messages is a time +and effort consuming task, calling for an autocomplete solution to assist +users. We introduce the task of chatbot interaction autocomplete. We present +ChaI-TeA: CHat InTEraction Autocomplete; An autocomplete evaluation framework +for LLM-based chatbot interactions. The framework includes a formal definition +of the task, coupled with suitable datasets and metrics. We use the framework +to evaluate 9 models on the defined auto completion task, finding that +while current off-the-shelf models perform fairly, there is still much room for +improvement, mainly in ranking of the generated suggestions. We provide +insights for practitioners working on this task and open new research +directions for researchers in the field. We release our framework to serve as a +foundation for future research. + +
+
+
+
+
+ + ♻ ☆ AIArena: A Blockchain-Based Decentralized AI Training Platform WWW + + +
+ The rapid advancement of AI has underscored critical challenges in its +development and implementation, largely due to centralized control by a few +major corporations. This concentration of power intensifies biases within AI +models, resulting from inadequate governance and oversight mechanisms. +Additionally, it limits public involvement and heightens concerns about the +integrity of model generation. Such monopolistic control over data and AI +outputs threatens both innovation and fair data usage, as users inadvertently +contribute data that primarily benefits these corporations. In this work, we +propose AIArena, a blockchain-based decentralized AI training platform designed +to democratize AI development and alignment through on-chain incentive +mechanisms. AIArena fosters an open and collaborative environment where +participants can contribute models and computing resources. Its on-chain +consensus mechanism ensures fair rewards for participants based on their +contributions. We instantiate and implement AIArena on the public Base +blockchain Sepolia testnet, and the evaluation results demonstrate the +feasibility of AIArena in real-world applications. + +
+
+ comment: Camera ready version. Accepted by the ACM Web Conference (WWW), 2025 +
+
+
+
+
+ + ♻ ☆ Mitigating the Stability-Plasticity Dilemma in Adaptive Train Scheduling + with Curriculum-Driven Continual DQN Expansion + + +
+ A continual learning agent builds on previous experiences to develop +increasingly complex behaviors by adapting to non-stationary and dynamic +environments while preserving previously acquired knowledge. However, scaling +these systems presents significant challenges, particularly in balancing the +preservation of previous policies with the adaptation of new ones to current +environments. This balance, known as the stability-plasticity dilemma, is +especially pronounced in complex multi-agent domains such as the train +scheduling problem, where environmental and agent behaviors are constantly +changing, and the search space is vast. In this work, we propose addressing +these challenges in the train scheduling problem using curriculum learning. We +design a curriculum with adjacent skills that build on each other to improve +generalization performance. Introducing a curriculum with distinct tasks +introduces non-stationarity, which we address by proposing a new algorithm: +Continual Deep Q-Network (DQN) Expansion (CDE). Our approach dynamically +generates and adjusts Q-function subspaces to handle environmental changes and +task requirements. CDE mitigates catastrophic forgetting through EWC while +ensuring high plasticity using adaptive rational activation functions. +Experimental results demonstrate significant improvements in learning +efficiency and adaptability compared to RL baselines and other adapted methods +for continual learning, highlighting the potential of our method in managing +the stability-plasticity dilemma in the adaptive train scheduling setting. + +
+
+ comment: 9 Pages, 2 Figures +
+
+
+
+
+ + ♻ ☆ UniFlow: A Foundation Model for Unified Urban Spatio-Temporal Flow + Prediction + + +
+ Urban spatio-temporal flow prediction, encompassing traffic flows and crowd +flows, is crucial for optimizing city infrastructure and managing traffic and +emergency responses. Traditional approaches have relied on separate models +tailored to either grid-based data, representing cities as uniform cells, or +graph-based data, modeling cities as networks of nodes and edges. In this +paper, we build UniFlow, a foundational model for general urban flow prediction +that unifies both grid-based and graph-based data. We first design a multi-view +spatio-temporal patching mechanism to standardize different data into a +consistent sequential format and then introduce a spatio-temporal transformer +architecture to capture complex correlations and dynamics. To leverage shared +spatio-temporal patterns across different data types and facilitate effective +cross-learning, we propose SpatioTemporal Memory Retrieval Augmentation +(ST-MRA). By creating structured memory modules to store shared spatio-temporal +patterns, ST-MRA enhances predictions through adaptive memory retrieval. +Extensive experiments demonstrate that UniFlow outperforms existing models in +both grid-based and graph-based flow prediction, excelling particularly in +scenarios with limited data availability, showcasing its superior performance +and broad applicability. The datasets and code implementation have been +released on https://github.com/YuanYuan98/UniFlow. + +
+
+
+
+
+ + ♻ ☆ CodeIF: Benchmarking the Instruction-Following Capabilities of Large + Language Models for Code Generation + + +
+ With the rapid advancement of Large Language Models (LLMs), the demand for +robust instruction-following capabilities in code generation tasks has grown +significantly. Code generation not only facilitates faster prototyping and +automated testing, but also augments developer efficiency through improved +maintainability and reusability of code. In this paper, we introduce CodeIF, +the first benchmark specifically designed to assess the abilities of LLMs to +adhere to task-oriented instructions within diverse code generation scenarios. +CodeIF encompasses a broad range of tasks, including function synthesis, error +debugging, algorithmic refactoring, and code explanation, thereby providing a +comprehensive suite to evaluate model performance across varying complexity +levels and programming domains. We conduct extensive experiments with LLMs, +analyzing their strengths and limitations in meeting the demands of these +tasks. The experimental results offer valuable insights into how well current +models align with human instructions, as well as the extent to which they can +generate consistent, maintainable, and contextually relevant code. Our findings +not only underscore the critical role that instruction-following LLMs can play +in modern software development, but also illuminate pathways for future +research aimed at enhancing their adaptability, reliability, and overall +effectiveness in automated code generation. + +
+
+
+
+
+ + ♻ ☆ Bounding Evidence and Estimating Log-Likelihood in VAE AISTATS 2023 + + +
+ Many crucial problems in deep learning and statistical inference are caused +by a variational gap, i.e., a difference between model evidence +(log-likelihood) and evidence lower bound (ELBO). In particular, in a classical +VAE setting that involves training via an ELBO cost function, it is difficult +to provide a robust comparison of the effects of training between models, since +we do not know a log-likelihood of data (but only its lower bound). In this +paper, to deal with this problem, we introduce a general and effective upper +bound, which allows us to efficiently approximate the evidence of data. We +provide extensive theoretical and experimental studies of our approach, +including its comparison to the other state-of-the-art upper bounds, as well as +its application as a tool for the evaluation of models that were trained on +various lower bounds. + +
+
+ comment: Paper accepted for AISTATS 2023 +
+
+
+
+
+ + ♻ ☆ DrugAgent: Automating AI-aided Drug Discovery Programming through LLM + Multi-Agent Collaboration + + +
+ Recent progress in Large Language Models (LLMs) has drawn attention to their +potential for accelerating drug discovery. However, a central problem remains: +translating theoretical ideas into robust implementations in the highly +specialized context of pharmaceutical research. This limitation prevents +practitioners from making full use of the latest AI developments in drug +discovery. To address this challenge, we introduce DrugAgent, a multi-agent +framework that automates machine learning (ML) programming for drug discovery +tasks. DrugAgent employs an LLM Planner that formulates high-level ideas and an +LLM Instructor that identifies and integrates domain knowledge when +implementing those ideas. We present case studies on three representative drug +discovery tasks. Our results show that DrugAgent consistently outperforms +leading baselines, including a relative improvement of 4.92% in ROC-AUC +compared to ReAct for drug-target interaction (DTI). DrugAgent is publicly +available at https://anonymous.4open.science/r/drugagent-5C42/. + +
+
+
+
+
+ + ♻ ☆ TAG: A Decentralized Framework for Multi-Agent Hierarchical + Reinforcement Learning + + +
+ Hierarchical organization is fundamental to biological systems and human +societies, yet artificial intelligence systems often rely on monolithic +architectures that limit adaptability and scalability. Current hierarchical +reinforcement learning (HRL) approaches typically restrict hierarchies to two +levels or require centralized training, which limits their practical +applicability. We introduce TAME Agent Framework (TAG), a framework for +constructing fully decentralized hierarchical multi-agent systems. TAG enables +hierarchies of arbitrary depth through a novel LevelEnv concept, which +abstracts each hierarchy level as the environment for the agents above it. This +approach standardizes information flow between levels while preserving loose +coupling, allowing for seamless integration of diverse agent types. We +demonstrate the effectiveness of TAG by implementing hierarchical architectures +that combine different RL agents across multiple levels, achieving improved +performance over classical multi-agent RL baselines on standard benchmarks. Our +results show that decentralized hierarchical organization enhances both +learning speed and final performance, positioning TAG as a promising direction +for scalable multi-agent systems. + +
+
+
+
+
+ + ♻ ☆ Exploration Implies Data Augmentation: Reachability and Generalisation + in Contextual MDPs + + +
+ In the zero-shot policy transfer (ZSPT) setting for contextual Markov +decision processes (MDP), agents train on a fixed set of contexts and must +generalise to new ones. Recent work has argued and demonstrated that increased +exploration can improve this generalisation, by training on more states in the +training contexts. In this paper, we demonstrate that training on more states +can indeed improve generalisation, but can come at a cost of reducing the +accuracy of the learned value function which should not benefit generalisation. +We introduce reachability in the ZSPT setting to define which states/contexts +require generalisation and explain why exploration can improve it. We +hypothesise and demonstrate that using exploration to increase the agent's +coverage while also increasing the accuracy improves generalisation even more. +Inspired by this, we propose a method Explore-Go that implements an exploration +phase at the beginning of each episode, which can be combined with existing on- +and off-policy RL algorithms and significantly improves generalisation even in +partially observable MDPs. We demonstrate the effectiveness of Explore-Go when +combined with several popular algorithms and show an increase in generalisation +performance across several environments. With this, we hope to provide +practitioners with a simple modification that can improve the generalisation of +their agents. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2406.08069 +
+
+
+
+
+ + ♻ ☆ Unifying Causal Representation Learning with the Invariance Principle ICLR2025 + + +
+ Causal representation learning (CRL) aims at recovering latent causal +variables from high-dimensional observations to solve causal downstream tasks, +such as predicting the effect of new interventions or more robust +classification. A plethora of methods have been developed, each tackling +carefully crafted problem settings that lead to different types of +identifiability. These different settings are widely assumed to be important +because they are often linked to different rungs of Pearl's causal hierarchy, +even though this correspondence is not always exact. This work shows that +instead of strictly conforming to this hierarchical mapping, many causal +representation learning approaches methodologically align their representations +with inherent data symmetries. Identification of causal variables is guided by +invariance principles that are not necessarily causal. This result allows us to +unify many existing approaches in a single method that can mix and match +different assumptions, including non-causal ones, based on the invariance +relevant to the problem at hand. It also significantly benefits applicability, +which we demonstrate by improving treatment effect estimation on real-world +high-dimensional ecological data. Overall, this paper clarifies the role of +causal assumptions in the discovery of causal variables and shifts the focus to +preserving data symmetries. + +
+
+ comment: ICLR2025 Camera ready +
+
+
+
+
+ + ♻ ☆ From Learning to Optimize to Learning Optimization Algorithms AISTATS 2025 + + +
+ Towards designing learned optimization algorithms that are usable beyond +their training setting, we identify key principles that classical algorithms +obey, but have up to now, not been used for Learning to Optimize (L2O). +Following these principles, we provide a general design pipeline, taking into +account data, architecture and learning strategy, and thereby enabling a +synergy between classical optimization and L2O, resulting in a philosophy of +Learning Optimization Algorithms. As a consequence our learned algorithms +perform well far beyond problems from the training distribution. We demonstrate +the success of these novel principles by designing a new learning-enhanced BFGS +algorithm and provide numerical experiments evidencing its adaptation to many +settings at test time. + +
+
+ comment: To appear at AISTATS 2025 +
+
+
+
+
+ + ♻ ☆ State Space Models are Provably Comparable to Transformers in Dynamic + Token Selection + + +
+ Deep neural networks based on state space models (SSMs) are attracting +significant attention in sequence modeling since their computational cost is +much smaller than that of Transformers. While the capabilities of SSMs have +been demonstrated through experiments in various tasks, theoretical +understanding of SSMs is still limited. In particular, most theoretical studies +discuss the capabilities of SSM layers without nonlinear layers, and there is a +lack of discussion on their combination with nonlinear layers. In this paper, +we explore the capabilities of SSMs combined with fully connected neural +networks, and show that they are comparable to Transformers in extracting the +essential tokens depending on the input. As concrete examples, we consider two +synthetic tasks, which are challenging for a single SSM layer, and demonstrate +that SSMs combined with nonlinear layers can efficiently solve these tasks. +Furthermore, we study the nonparametric regression task, and prove that the +ability of SSMs is equivalent to that of Transformers in estimating functions +belonging to a certain class. + +
+
+ comment: 43 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Narrowing the Gap between Adversarial and Stochastic MDPs via Policy + Optimization + + +
+ We consider the problem of learning in adversarial Markov decision processes +(MDPs) with an oblivious adversary in a full-information setting. The agent +interacts with an environment during $T$ episodes, each of which consists of +$H$ stages, and each episode is evaluated with respect to a reward function +that will be revealed only at the end of the episode. We propose an algorithm, +called APO-MVP, that achieves a regret bound of order +$\tilde{\mathcal{O}}(\mathrm{poly}(H)\sqrt{SAT})$, where $S$ and $A$ are the sizes +of the state and action spaces, respectively. This result improves upon the +best-known regret bound by a factor of $\sqrt{S}$, bridging the gap between +adversarial and stochastic MDPs, and matching the minimax lower bound +$\Omega(\sqrt{H^3SAT})$ as far as the dependencies in $S,A,T$ are concerned. +The proposed algorithm and analysis completely avoid the typical tool given by +occupancy measures; instead, it performs policy optimization based only on +dynamic programming and on a black-box online linear optimization strategy run +over estimated advantage functions, making it easy to implement. The analysis +leverages two recent techniques: policy optimization based on online linear +optimization strategies (Jonckheere et al., 2023) and a refined martingale +analysis of the impact on values of estimating transition kernels (Zhang et +al., 2023). + +
+
+
+
+
+ + ♻ ☆ Distributed Differentially Private Data Analytics via Secure Sketching + + +
+ We introduce the linear-transformation model, a distributed model of +differentially private data analysis. Clients have access to a trusted platform +capable of applying a public matrix to their inputs. Such computations can be +securely distributed across multiple servers using simple and efficient secure +multiparty computation techniques. + The linear-transformation model serves as an intermediate model between the +highly expressive central model and the minimal local model. In the central +model, clients have access to a trusted platform capable of applying any +function to their inputs. However, this expressiveness comes at a cost, as it +is often prohibitively expensive to distribute such computations, leading to +the central model typically being implemented by a single trusted server. In +contrast, the local model assumes no trusted platform, which forces clients to +add significant noise to their data. The linear-transformation model avoids the +single point of failure for privacy present in the central model, while also +mitigating the high noise required in the local model. + We demonstrate that linear transformations are very useful for differential +privacy, allowing for the computation of linear sketches of input data. These +sketches largely preserve utility for tasks such as private low-rank +approximation and private ridge regression, while introducing only minimal +error, critically independent of the number of clients. + +
+
+
+
+
+ + ♻ ☆ Task-optimal data-driven surrogate models for eNMPC via differentiable + simulation and optimization + + +
+ Mechanistic dynamic process models may be too computationally expensive to be +usable as part of a real-time capable predictive controller. We present a +method for end-to-end learning of Koopman surrogate models for optimal +performance in a specific control task. In contrast to previous contributions +that employ standard reinforcement learning (RL) algorithms, we use a training +algorithm that exploits the differentiability of environments based on +mechanistic simulation models to aid the policy optimization. We evaluate the +performance of our method by comparing it to that of other training algorithms +on an existing economic nonlinear model predictive control (eNMPC) case study +of a continuous stirred-tank reactor (CSTR) model. Compared to the benchmark +methods, our method produces similar economic performance while eliminating +constraint violations. Thus, for this case study, our method outperforms the +others and offers a promising path toward more performant controllers that +employ dynamic surrogate models. + +
+
+ comment: 8 pages, 4 figures, 1 table +
+
+
+
+
+ + ♻ ☆ DarwinLM: Evolutionary Structured Pruning of Large Language Models + + +
+ Large Language Models (LLMs) have achieved significant success across various +NLP tasks. However, their massive computational costs limit their widespread +use, particularly in real-time applications. Structured pruning offers an +effective solution by compressing models and directly providing end-to-end +speed improvements, regardless of the hardware environment. Meanwhile, +different components of the model exhibit varying sensitivities towards +pruning, calling for non-uniform model compression. However, a pruning method +should not only identify a capable substructure, but also account for +post-compression training. To this end, we propose DarwinLM, a method for +training-aware structured pruning. DarwinLM builds upon an evolutionary search +process, generating multiple offspring models in each generation through +mutation, and selecting the fittest for survival. To assess the effect of +post-training, we incorporate a lightweight, multistep training process within +the offspring population, progressively increasing the number of tokens and +eliminating poorly performing models in each selection stage. We validate our +method through extensive experiments on Llama-2-7B, Llama-3.1-8B and +Qwen-2.5-14B-Instruct, achieving state-of-the-art performance for structured +pruning. For instance, DarwinLM surpasses ShearedLlama while requiring 5x less +training data during post-compression training. Code is at: +https://github.com/IST-DASLab/DarwinLM + +
+
+ comment: Code: https://github.com/IST-DASLab/DarwinLM +
+
+
+
+
+ + ♻ ☆ Verifiable and Provably Secure Machine Unlearning + + +
+ Machine unlearning aims to remove points from the training dataset of a +machine learning model after training: e.g., when a user requests their data to +be deleted. While many unlearning methods have been proposed, none of them +enable users to audit the procedure. Furthermore, recent work shows a user is +unable to verify whether their data was unlearnt from an inspection of the +model parameter alone. Rather than reasoning about parameters, we propose to +view verifiable unlearning as a security problem. To this end, we present the +first cryptographic definition of verifiable unlearning to formally capture the +guarantees of an unlearning system. In this framework, the server first +computes a proof that the model was trained on a dataset D. Given a user's data +point d requested to be deleted, the server updates the model using an +unlearning algorithm. It then provides a proof of the correct execution of +unlearning and that d is not part of D', where D' is the new training dataset +(i.e., d has been removed). Our framework is generally applicable to different +unlearning techniques that we abstract as admissible functions. We instantiate +a protocol in the framework, based on cryptographic assumptions, using SNARKs +and hash chains. Finally, we implement the protocol for three different +unlearning techniques and validate its feasibility for linear regression, +logistic regression, and neural networks. + +
+
+ comment: Accepted at IEEE SaTML2025 +
+
+
+
+
+ + ♻ ☆ Solving Inverse Problem for Multi-armed Bandits via Convex Optimization + + +
+ We consider the inverse problem of multi-armed bandits (IMAB) that are widely +used in neuroscience and psychology research for behavior modelling. We first +show that the IMAB problem is not convex in general, but can be relaxed to a +convex problem via variable transformation. Based on this result, we propose a +two-step sequential heuristic for (approximately) solving the IMAB problem. We +discuss a condition where our method provides a global solution to the IMAB +problem with a certificate, as well as approximations to further save computing +time. Numerical experiments indicate that our heuristic method is more robust +than directly solving the IMAB problem via repeated local optimization, and can +achieve the performance of Monte Carlo methods within a significantly decreased +running time. We provide the implementation of our method based on CVXPY, which +allows straightforward application by users not well versed in convex +optimization. + +
+
+
+
+
+ + ♻ ☆ Iterative Value Function Optimization for Guided Decoding + + +
+ While Reinforcement Learning from Human Feedback (RLHF) has become the +predominant method for controlling language model outputs, it suffers from high +computational costs and training instability. Guided decoding, especially +value-guided methods, offers a cost-effective alternative by controlling +outputs without re-training models. However, the accuracy of the value function +is crucial for value-guided decoding, as inaccuracies can lead to suboptimal +decision-making and degraded performance. Existing methods struggle with +accurately estimating the optimal value function, leading to less effective +control. We propose Iterative Value Function Optimization, a novel framework +that addresses these limitations through two key components: Monte Carlo Value +Estimation, which reduces estimation variance by exploring diverse +trajectories, and Iterative On-Policy Optimization, which progressively +improves value estimation through collecting trajectories from value-guided +policies. Extensive experiments on text summarization, multi-turn dialogue, and +instruction following demonstrate the effectiveness of value-guided decoding +approaches in aligning language models. These approaches not only achieve +alignment but also significantly reduce computational costs by leveraging +principled value function optimization for efficient and effective control. + +
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ POMDP-Driven Cognitive Massive MIMO Radar: Joint Target + Detection-Tracking In Unknown Disturbances + + +
+ The joint detection and tracking of a moving target embedded in an unknown +disturbance represents a key feature that motivates the development of the +cognitive radar paradigm. Building upon recent advancements in robust target +detection with multiple-input multiple-output (MIMO) radars, this work explores +the application of a Partially Observable Markov Decision Process (POMDP) +framework to enhance the tracking and detection tasks in a statistically +unknown environment. In the POMDP setup, the radar system is considered as an +intelligent agent that continuously senses the surrounding environment, +optimizing its actions to maximize the probability of detection $(P_D)$ and +improve the target position and velocity estimation, all this while keeping a +constant probability of false alarm $(P_{FA})$. The proposed approach employs +an online algorithm that does not require any a priori knowledge of the noise +statistics, and it relies on a much more general observation model than the +traditional range-azimuth-elevation model employed by conventional tracking +algorithms. Simulation results clearly show substantial performance improvement +of the POMDP-based algorithm compared to the State-Action-Reward-State-Action +(SARSA)-based one that has been recently investigated in the context of massive +MIMO (MMIMO) radar systems. + +
+
+ comment: The paper has been submitted to IEEE Transactions on Radar Systems +
+
+
+
+
+ + ♻ ☆ Gated Delta Networks: Improving Mamba2 with Delta Rule ICLR 2025 + + +
+ Linear Transformers have gained attention as efficient alternatives to +standard Transformers, but their performance in retrieval and long-context +tasks has been limited. To address these limitations, recent work has explored +two distinct mechanisms: gating for adaptive memory control and the delta +update rule for precise memory modifications. We observe that these mechanisms +are complementary: gating enables rapid memory erasure while the delta rule +facilitates targeted updates. Building on this insight, we introduce the gated +delta rule and develop a parallel training algorithm optimized for modern +hardware. Our proposed architecture, Gated DeltaNet, consistently surpasses +existing models like Mamba2 and DeltaNet across multiple benchmarks, including +language modeling, common-sense reasoning, in-context retrieval, length +extrapolation, and long-context understanding. We further enhance performance +by developing hybrid architectures that combine Gated DeltaNet layers with +sliding window attention or Mamba2 layers, achieving both improved training +efficiency and superior task performance. + +
+
+ comment: ICLR 2025 camera ready +
+
+
+
+
+ + ♻ ☆ ChemVLM: Exploring the Power of Multimodal Large Language Models in + Chemistry Area + + +
+ Large Language Models (LLMs) have achieved remarkable success and have been +applied across various scientific fields, including chemistry. However, many +chemical tasks require the processing of visual information, which cannot be +successfully handled by existing chemical LLMs. This brings a growing need for +models capable of integrating multimodal information in the chemical domain. In +this paper, we introduce \textbf{ChemVLM}, an open-source chemical multimodal +large language model specifically designed for chemical applications. ChemVLM +is trained on a carefully curated bilingual multimodal dataset that enhances +its ability to understand both textual and visual chemical information, +including molecular structures, reactions, and chemistry examination questions. +We develop three datasets for comprehensive evaluation, tailored to Chemical +Optical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and +Multimodal Molecule Understanding tasks. We benchmark ChemVLM against a range +of open-source and proprietary multimodal large language models on various +tasks. Experimental results demonstrate that ChemVLM achieves competitive +performance across all evaluated tasks. Our model can be found at +https://huggingface.co/AI4Chem/ChemVLM-26B. + +
+
+ comment: 11 pages, updated version +
+
+
+
+
+ + ♻ ☆ Flow-based Bayesian filtering for high-dimensional nonlinear stochastic + dynamical systems + + +
+ Bayesian filtering for high-dimensional nonlinear stochastic dynamical +systems is a fundamental yet challenging problem in many fields of science and +engineering. Existing methods face significant obstacles: Gaussian-based +filters struggle with non-Gaussian distributions, while sequential Monte Carlo +methods are computationally intensive and prone to particle degeneracy in high +dimensions. Although generative models in machine learning have made +significant progress in modeling high-dimensional non-Gaussian distributions, +their inefficiency in online updating limits their applicability to filtering +problems. To address these challenges, we propose a flow-based Bayesian filter +(FBF) that integrates normalizing flows to construct a novel latent linear +state-space model with Gaussian filtering distributions. This framework +facilitates efficient density estimation and sampling using invertible +transformations provided by normalizing flows, and it enables the construction +of filters in a data-driven manner, without requiring prior knowledge of system +dynamics or observation models. Numerical experiments demonstrate the superior +accuracy and efficiency of FBF. + +
+
+
+
+
+ + ♻ ☆ Learning High-Degree Parities: The Crucial Role of the Initialization + + +
+ Parities have become a standard benchmark for evaluating learning algorithms. +Recent works show that regular neural networks trained by gradient descent can +efficiently learn degree $k$ parities on uniform inputs for constant $k$, but +fail to do so when $k$ and $d-k$ grow with $d$ (here $d$ is the ambient +dimension). However, the case where $k=d-O_d(1)$ (almost-full parities), +including the degree $d$ parity (the full parity), has remained unsettled. This +paper shows that for gradient descent on regular neural networks, learnability +depends on the initial weight distribution. On one hand, the discrete +Rademacher initialization enables efficient learning of almost-full parities, +while on the other hand, its Gaussian perturbation with large enough constant +standard deviation $\sigma$ prevents it. The positive result for almost-full +parities is shown to hold up to $\sigma=O(d^{-1})$, pointing to questions about +a sharper threshold phenomenon. Unlike statistical query (SQ) learning, where a +singleton function class like the full parity is trivially learnable, our +negative result applies to a fixed function and relies on an initial gradient +alignment measure of potential broader relevance to neural networks learning. + +
+
+
+
+
+ + ♻ ☆ Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class + Feature Compensator ICLR 2025 + + +
+ Dataset distillation has emerged as a technique aiming to condense +informative features from large, natural datasets into a compact and synthetic +form. While recent advancements have refined this technique, its performance is +bottlenecked by the prevailing class-specific synthesis paradigm. Under this +paradigm, synthetic data is optimized exclusively for a pre-assigned one-hot +label, creating an implicit class barrier in feature condensation. This leads +to inefficient utilization of the distillation budget and oversight of +inter-class feature distributions, which ultimately limits the effectiveness +and efficiency, as demonstrated in our analysis. To overcome these constraints, +this paper presents the Inter-class Feature Compensator (INFER), an innovative +distillation approach that transcends the class-specific data-label framework +widely utilized in current dataset distillation methods. Specifically, INFER +leverages a Universal Feature Compensator (UFC) to enhance feature integration +across classes, enabling the generation of multiple additional synthetic +instances from a single UFC input. This significantly improves the efficiency +of the distillation budget. Moreover, INFER enriches inter-class interactions +during the distillation, thereby enhancing the effectiveness and +generalizability of the distilled data. By allowing for the linear +interpolation of labels similar to those in the original dataset, INFER +meticulously optimizes the synthetic data and dramatically reduces the size of +soft labels in the synthetic dataset to almost zero, establishing a new +benchmark for efficiency and effectiveness in dataset distillation. In +practice, INFER demonstrates state-of-the-art performance across benchmark +datasets. For instance, in the ipc = 50 setting on ImageNet-1k with the same +compression level, it outperforms SRe2L by 34.5% using ResNet18. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ DP-LDMs: Differentially Private Latent Diffusion Models + + +
+ Diffusion models (DMs) are one of the most widely used generative models for +producing high quality images. However, a flurry of recent papers points out +that DMs are the least private form of image generators, by extracting a +significant number of near-identical replicas of training images from DMs. +Existing privacy-enhancing techniques for DMs, unfortunately, do not provide a +good privacy-utility tradeoff. In this paper, we aim to improve the current +state of DMs with differential privacy (DP) by adopting the $\textit{Latent}$ +Diffusion Models (LDMs). LDMs are equipped with powerful pre-trained +autoencoders that map the high-dimensional pixels into lower-dimensional latent +representations, in which DMs are trained, yielding more efficient and faster +training of DMs. Rather than fine-tuning the entire LDMs, we fine-tune only the +$\textit{attention}$ modules of LDMs with DP-SGD, reducing the number of +trainable parameters by roughly $90\%$ and achieving a better privacy-accuracy +trade-off. Our approach allows us to generate realistic, high-dimensional +images (256x256) conditioned on text prompts with DP guarantees, which, to the +best of our knowledge, has not been attempted before. Our approach provides a +promising direction for training more powerful, yet training-efficient +differentially private DMs, producing high-quality DP images. Our code is +available at https://anonymous.4open.science/r/DP-LDM-4525. + +
+
+
+
+
+ + ♻ ☆ Improved Performances and Motivation in Intelligent Tutoring Systems: + Combining Machine Learning and Learner Choice + + +
+ Large class sizes challenge personalized learning in schools, prompting the +use of educational technologies such as intelligent tutoring systems. To +address this, we present an AI-driven personalization system, called ZPDES, +based on the Learning Progress Hypothesis - modeling curiosity-driven learning +- and multi-armed bandit techniques. It sequences exercises that maximize +learning progress for each student. While previous studies demonstrated its +efficacy in enhancing learning compared to hand-made curricula, its impact on +student motivation remained unexplored. Furthermore, ZPDES previously lacked +features allowing student choice, a limitation in agency that conflicts with +its foundation on models of curiosity-driven learning. This study investigates +how integrating choice, as a gamification element unrelated to exercise +difficulty, affects both learning outcomes and motivation. We conducted an +extensive field study (265 children aged 7-8 years, RCT design), comparing ZPDES +with and without choice against a hand-designed curriculum. Results show that +ZPDES improves both learning performance and the learning experience. Moreover, +adding choice to ZPDES enhances intrinsic motivation and further strengthens +its learning benefits. In contrast, incorporating choice into a fixed, linear +curriculum negatively impacts learning outcomes. These findings highlight that +the intrinsic motivation elicited by choice (gamification) is beneficial only +when paired with an adaptive personalized learning system. This insight is +critical as gamified features become increasingly prevalent in educational +technologies. + +
+
+
+
+
+ + ♻ ☆ Regularization-based Framework for Quantization-, Fault- and + Variability-Aware Training + + +
+ Efficient inference is critical for deploying deep learning models on edge AI +devices. Low-bit quantization (e.g., 3- and 4-bit) with fixed-point arithmetic +improves efficiency, while low-power memory technologies like analog +nonvolatile memory enable further gains. However, these methods introduce +non-ideal hardware behavior, including bit faults and device-to-device +variability. We propose a regularization-based quantization-aware training +(QAT) framework that supports fixed, learnable step-size, and learnable +non-uniform quantization, achieving competitive results on CIFAR-10 and +ImageNet. Our method also extends to Spiking Neural Networks (SNNs), +demonstrating strong performance on 4-bit networks on CIFAR10-DVS and N-Caltech +101. Beyond quantization, our framework enables fault and variability-aware +fine-tuning, mitigating stuck-at faults (fixed weight bits) and device +resistance variability. Compared to prior fault-aware training, our approach +significantly improves performance recovery under up to 20% bit-fault rate and +40% device-to-device variability. Our results establish a generalizable +framework for quantization and robustness-aware training, enhancing efficiency +and reliability in low-power, non-ideal hardware. + +
+
+ comment: AB and RS contributed equally to this work. A version of this paper + was accepted at MLNCP @ NeurIPS '24 +
+
+
+
+
+ + ♻ ☆ Grams: Gradient Descent with Adaptive Momentum Scaling SC + + +
+ We introduce $\mathbf{G}$radient Descent with $\mathbf{A}$daptive +$\mathbf{M}$omentum $\mathbf{S}$caling ($\mathbf{Grams}$), a novel optimization +algorithm that decouples the direction and magnitude of parameter updates in +deep learning. Unlike traditional optimizers that directly integrate momentum +into updates, Grams separates the update direction, derived from current +gradients, from momentum, which is used solely for adaptive magnitude scaling. +This approach enables Grams to achieve improved loss descent compared to +state-of-the-art cautious and momentum-based optimizers. We theoretically +demonstrate that Grams descends faster than other state-of-the-art optimizers +and establish a global convergence guarantee for Grams. We also validate its +effectiveness through extensive empirical evaluations. The results demonstrate +Grams' superior performance, including faster convergence and better +generalization, compared to widely-used optimizers such as Adam, Lion, and +their cautious variants. Our results highlight Grams' potential as a +transformative approach for efficiently training and fine-tuning large language +models. Code is available at https://github.com/Gunale0926/Grams. + +
+
+ comment: SCOPE Workshop @ ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Revisiting Random Walks for Learning on Graphs + + +
+ We revisit a simple model class for machine learning on graphs, where a +random walk on a graph produces a machine-readable record, and this record is +processed by a deep neural network to directly make vertex-level or graph-level +predictions. We call these stochastic machines random walk neural networks +(RWNNs), and through principled analysis, show that we can design them to be +isomorphism invariant while capable of universal approximation of graph +functions in probability. A useful finding is that almost any kind of record of +random walks guarantees probabilistic invariance as long as the vertices are +anonymized. This enables us, for example, to record random walks in plain text +and adopt a language model to read these text records to solve graph tasks. We +further establish a parallelism to message passing neural networks using tools +from Markov chain theory, and show that over-smoothing in message passing is +alleviated by construction in RWNNs, while over-squashing manifests as +probabilistic under-reaching. We empirically demonstrate RWNNs on a range of +problems, verifying our theoretical analysis and demonstrating the use of +language models for separating strongly regular graphs where 3-WL test fails, +and transductive classification on arXiv citation network. Code is available at +https://github.com/jw9730/random-walk. + +
+
+ comment: 51 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Training a Generally Curious Agent + + +
+ Efficient exploration is essential for intelligent systems interacting with +their environment, but existing language models often fall short in scenarios +that require strategic information gathering. In this paper, we present +PAPRIKA, a fine-tuning approach that enables language models to develop general +decision-making capabilities that are not confined to particular environments. +By training on synthetic interaction data from different tasks that require +diverse strategies, PAPRIKA teaches models to explore and adapt their behavior +on a new task based on environment feedback in-context without more gradient +updates. Experimental results show that models fine-tuned with PAPRIKA can +effectively transfer their learned decision-making capabilities to entirely +unseen tasks without additional training. Unlike traditional training, our +approach's primary bottleneck lies in sampling useful interaction data instead +of model updates. To improve sample efficiency, we propose a curriculum +learning strategy that prioritizes sampling trajectories from tasks with high +learning potential. These results suggest a promising path towards AI systems +that can autonomously solve novel sequential decision-making problems that +require interactions with the external world. + +
+
+ comment: Project Website: https://paprika-llm.github.io +
+
+
+
+
+ + ♻ ☆ Affordance-Guided Reinforcement Learning via Visual Prompting + + +
+ Robots equipped with reinforcement learning (RL) have the potential to learn +a wide range of skills solely from a reward signal. However, obtaining a robust +and dense reward signal for general manipulation tasks remains a challenge. +Existing learning-based approaches require significant data, such as human +demonstrations of success and failure, to learn task-specific reward functions. +Recently, there is also a growing adoption of large multi-modal foundation +models for robotics that can perform visual reasoning in physical contexts and +generate coarse robot motions for manipulation tasks. Motivated by this range +of capability, in this work, we present Keypoint-based Affordance Guidance for +Improvements (KAGI), a method leveraging rewards shaped by vision-language +models (VLMs) for autonomous RL. State-of-the-art VLMs have demonstrated +impressive reasoning about affordances through keypoints in zero-shot, and we +use these to define dense rewards that guide autonomous robotic learning. On +real-world manipulation tasks specified by natural language descriptions, KAGI +improves the sample efficiency of autonomous RL and enables successful task +completion in 30K online fine-tuning steps. Additionally, we demonstrate the +robustness of KAGI to reductions in the number of in-domain demonstrations used +for pre-training, reaching similar performance in 45K online fine-tuning steps. +Project website: https://sites.google.com/view/affordance-guided-rl + +
+
+ comment: 8 pages, 6 figures. Robotics: Science and Systems (RSS) 2024, Task + Specification for General-Purpose Intelligent Robots & Lifelong Robot + Learning Workshops +
+
+
+
+
+ + ♻ ☆ $μ^2$-SGD: Stable Stochastic Optimization via a Double Momentum + Mechanism + + +
+ We consider stochastic convex optimization problems where the objective is an +expectation over smooth functions. For this setting we suggest a novel gradient +estimate that combines two recent mechanisms that are related to the notion of +momentum. Then, we design an SGD-style algorithm as well as an accelerated +version that make use of this new estimator, and demonstrate the robustness of +these new approaches to the choice of the learning rate. Concretely, we show +that these approaches obtain the optimal convergence rates for both noiseless +and noisy cases with the same choice of fixed learning rate. Moreover, for the +noisy case we show that these approaches achieve the same optimal bound for a +very wide range of learning rates. + +
+
+
+
+
+ + ♻ ☆ CarPlanner: Consistent Auto-regressive Trajectory Planning for + Large-scale Reinforcement Learning in Autonomous Driving CVPR 2025 + + +
+ Trajectory planning is vital for autonomous driving, ensuring safe and +efficient navigation in complex environments. While recent learning-based +methods, particularly reinforcement learning (RL), have shown promise in +specific scenarios, RL planners struggle with training inefficiencies and +managing large-scale, real-world driving scenarios. In this paper, we introduce +\textbf{CarPlanner}, a \textbf{C}onsistent \textbf{a}uto-\textbf{r}egressive +\textbf{Planner} that uses RL to generate multi-modal trajectories. The +auto-regressive structure enables efficient large-scale RL training, while the +incorporation of consistency ensures stable policy learning by maintaining +coherent temporal consistency across time steps. Moreover, CarPlanner employs a +generation-selection framework with an expert-guided reward function and an +invariant-view module, simplifying RL training and enhancing policy +performance. Extensive analysis demonstrates that our proposed RL framework +effectively addresses the challenges of training efficiency and performance +enhancement, positioning CarPlanner as a promising solution for trajectory +planning in autonomous driving. To the best of our knowledge, we are the first +to demonstrate that the RL-based planner can surpass both IL- and rule-based +state-of-the-arts (SOTAs) on the challenging large-scale real-world dataset +nuPlan. Our proposed CarPlanner surpasses RL-, IL-, and rule-based SOTA +approaches within this demanding dataset. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ♻ ☆ CPT-Boosted Wav2vec2.0: Towards Noise Robust Speech Recognition for + Classroom Environments + + +
+ Creating Automatic Speech Recognition (ASR) systems that are robust and +resilient to classroom conditions is paramount to the development of AI tools +to aid teachers and students. In this work, we study the efficacy of continued +pretraining (CPT) in adapting Wav2vec2.0 to the classroom domain. We show that +CPT is a powerful tool in that regard and reduces the Word Error Rate (WER) of +Wav2vec2.0-based models by upwards of 10%. More specifically, CPT improves the +model's robustness to different noises, microphones and classroom conditions. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.13018 +
+
+
+
+
+ + ♻ ☆ BRIDGE: Bootstrapping Text to Control Time-Series Generation via + Multi-Agent Iterative Optimization and Diffusion Modelling + + +
+ Time-series Generation (TSG) is a prominent research area with broad +applications in simulations, data augmentation, and counterfactual analysis. +While existing methods have shown promise in unconditional single-domain TSG, +real-world applications demand cross-domain approaches capable of +controlled generation tailored to domain-specific constraints and +instance-level requirements. In this paper, we argue that text can provide +semantic insights, domain information and instance-specific temporal patterns, +to guide and improve TSG. We introduce ``Text-Controlled TSG'', a task focused +on generating realistic time series by incorporating textual descriptions. To +address data scarcity in this setting, we propose a novel LLM-based Multi-Agent +framework that synthesizes diverse, realistic text-to-TS datasets. Furthermore, +we introduce BRIDGE, a hybrid text-controlled TSG framework that integrates +semantic prototypes with text description for supporting domain-level guidance. +This approach achieves state-of-the-art generation fidelity on 11 of 12 +datasets, and improves controllability by 12.52% on MSE and 6.34% on MAE compared +to no text input generation, highlighting its potential for generating tailored +time-series data. + +
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ♻ Online Planning for Multi-UAV Pursuit-Evasion in Unknown Environments + Using Deep Reinforcement Learning + + +
+ Multi-UAV pursuit-evasion, where pursuers aim to capture evaders, poses a key +challenge for UAV swarm intelligence. Multi-agent reinforcement learning (MARL) +has demonstrated potential in modeling cooperative behaviors, but most RL-based +approaches remain constrained to simplified simulations with limited dynamics +or fixed scenarios. Previous attempts to deploy RL policy to real-world +pursuit-evasion are largely restricted to two-dimensional scenarios, such as +ground vehicles or UAVs at fixed altitudes. In this paper, we address multi-UAV +pursuit-evasion by considering UAV dynamics and physical constraints. We +introduce an evader prediction-enhanced network to tackle partial observability +in cooperative strategy learning. Additionally, we propose an adaptive +environment generator within MARL training, enabling higher exploration +efficiency and better policy generalization across diverse scenarios. +Simulations show our method significantly outperforms all baselines in +challenging scenarios, generalizing to unseen scenarios with a 100% capture +rate. Finally, we derive a feasible policy via a two-stage reward refinement +and deploy the policy on real quadrotors in a zero-shot manner. To our +knowledge, this is the first work to derive and deploy an RL-based policy using +collective thrust and body rates control commands for multi-UAV pursuit-evasion +in unknown environments. The open-source code and videos are available at +https://sites.google.com/view/pursuit-evasion-rl. + +
+
+
+
+
+ + ♻ ☆ LoBAM: LoRA-Based Backdoor Attack on Model Merging + + +
+ Model merging is an emerging technique that integrates multiple models +fine-tuned on different tasks to create a versatile model that excels in +multiple domains. This scheme, in the meantime, may open up backdoor attack +opportunities where one single malicious model can jeopardize the integrity of +the merged model. Existing works try to demonstrate the risk of such attacks by +assuming substantial computational resources, focusing on cases where the +attacker can fully fine-tune the pre-trained model. Such an assumption, +however, may not be feasible given the increasing size of machine learning +models. In practice where resources are limited and the attacker can only +employ techniques like Low-Rank Adaptation (LoRA) to produce the malicious +model, it remains unclear whether the attack can still work and pose threats. +In this work, we first identify that the attack efficacy is significantly +diminished when using LoRA for fine-tuning. Then, we propose LoBAM, a method +that yields high attack success rate with minimal training resources. The key +idea of LoBAM is to amplify the malicious weights in an intelligent way that +effectively enhances the attack efficacy. We demonstrate that our design can +lead to improved attack success rate through extensive empirical experiments +across various model merging scenarios. Moreover, we show that our method is +highly stealthy and is difficult to detect and defend against. + +
+
+
+
+
+ + ♻ ☆ SePer: Measure Retrieval Utility Through The Lens Of Semantic Perplexity + Reduction ICLR 2025 + + +
+ Large Language Models (LLMs) have demonstrated improved generation +performance by incorporating externally retrieved knowledge, a process known as +retrieval-augmented generation (RAG). Despite the potential of this approach, +existing studies evaluate RAG effectiveness by 1) assessing retrieval and +generation components jointly, which obscures retrieval's distinct +contribution, or 2) examining retrievers using traditional metrics such as +NDCG, which creates a gap in understanding retrieval's true utility in the +overall generation process. To address the above limitations, in this work, we +introduce an automatic evaluation method that measures retrieval quality +through the lens of information gain within the RAG framework. Specifically, we +propose Semantic Perplexity (SePer), a metric that captures the LLM's internal +belief about the correctness of the retrieved information. We quantify the +utility of retrieval by the extent to which it reduces semantic perplexity +post-retrieval. Extensive experiments demonstrate that SePer not only aligns +closely with human preferences but also offers a more precise and efficient +evaluation of retrieval utility across diverse RAG scenarios. + +
+
+ comment: ICLR 2025 Spotlight +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ♻ ☆ More than Memes: A Multimodal Topic Modeling Approach to Conspiracy + Theories on Telegram + + +
+ To address the increasing prevalence of (audio-)visual data on social media, +and to capture the evolving and dynamic nature of this communication, +researchers have begun to explore the potential of unsupervised approaches for +analyzing multimodal online content. However, existing research often neglects +visual content beyond memes, and in addition lacks methods to compare topic +models across modalities. Our study addresses these gaps by applying multimodal +topic modeling for analyzing conspiracy theories in German-language Telegram +channels. We use BERTopic with CLIP for the analysis of textual and visual data +in a corpus of ~40,000 Telegram messages posted in October 2023 in 571 +German-language Telegram channels known for disseminating conspiracy theories. +Through this dataset, we provide insights into unimodal and multimodal topic +models by analyzing symmetry and intersections of topics across modalities. We +demonstrate the variety of textual and visual content shared in the channels +discovered through the topic modeling, and propose a conceptual framework for +the analysis of textual and visual discursive strategies in the communication +of conspiracy theories. We apply the framework in a case study of the topic +group Israel Gaza. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ An Undetectable Watermark for Generative Image Models ICLR 2025 + + +
+ We present the first undetectable watermarking scheme for generative image +models. Undetectability ensures that no efficient adversary can distinguish +between watermarked and un-watermarked images, even after making many adaptive +queries. In particular, an undetectable watermark does not degrade image +quality under any efficiently computable metric. Our scheme works by selecting +the initial latents of a diffusion model using a pseudorandom error-correcting +code (Christ and Gunn, 2024), a strategy which guarantees undetectability and +robustness. We experimentally demonstrate that our watermarks are +quality-preserving and robust using Stable Diffusion 2.1. Our experiments +verify that, in contrast to every prior scheme we tested, our watermark does +not degrade image quality. Our experiments also demonstrate robustness: +existing watermark removal attacks fail to remove our watermark from images +without significantly degrading the quality of the images. Finally, we find +that we can robustly encode 512 bits in our watermark, and up to 2500 bits when +the images are not subjected to watermark removal attacks. Our code is +available at https://github.com/XuandongZhao/PRC-Watermark. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 4 + +
+
+
+ + ♻ ☆ 3D-AffordanceLLM: Harnessing Large Language Models for Open-Vocabulary + Affordance Detection in 3D Worlds ICLR + + +
+ 3D Affordance detection is a challenging problem with broad applications on +various robotic tasks. Existing methods typically formulate the detection +paradigm as a label-based semantic segmentation task. This paradigm relies on +predefined labels and lacks the ability to comprehend complex natural language, +resulting in limited generalization in open-world scenes. To address these +limitations, we reformulate the traditional affordance detection paradigm into +\textit{Instruction Reasoning Affordance Segmentation} (IRAS) task. This task +is designed to output an affordance mask region given a query reasoning text, +which avoids fixed categories of input labels. We accordingly propose the +\textit{3D-AffordanceLLM} (3D-ADLLM), a framework designed for reasoning +affordance detection in 3D open-scene. Specifically, 3D-ADLLM introduces large +language models (LLMs) to 3D affordance perception with a custom-designed +decoder for generating affordance masks, thus achieving open-world reasoning +affordance detection. In addition, given the scarcity of 3D affordance datasets +for training large models, we seek to extract knowledge from general +segmentation data and transfer it to affordance detection. Thus, we propose a +multi-stage training strategy that begins with a novel pre-training task, i.e., +\textit{Referring Object Part Segmentation}~(ROPS). This stage is designed to +equip the model with general recognition and segmentation capabilities at the +object-part level. Then followed by fine-tuning with the IRAS task, 3D-ADLLM +obtains the reasoning ability for affordance detection. In summary, 3D-ADLLM +leverages the rich world knowledge and human-object interaction reasoning +ability of LLMs, achieving approximately an 8\% improvement in mIoU on +open-vocabulary affordance detection tasks. + +
+
+ comment: ICLR +
+
+
+
+
+ + ♻ ☆ A Survey on Vision-Language-Action Models for Embodied AI + + +
+ Embodied AI is widely recognized as a key element of artificial general +intelligence because it involves controlling embodied agents to perform tasks +in the physical world. Building on the success of large language models and +vision-language models, a new category of multimodal models -- referred to as +vision-language-action models (VLAs) -- has emerged to address +language-conditioned robotic tasks in embodied AI by leveraging their distinct +ability to generate actions. In recent years, a myriad of VLAs have been +developed, making it imperative to capture the rapidly evolving landscape +through a comprehensive survey. To this end, we present the first survey on +VLAs for embodied AI. This work provides a detailed taxonomy of VLAs, organized +into three major lines of research. The first line focuses on individual +components of VLAs. The second line is dedicated to developing control policies +adept at predicting low-level actions. The third line comprises high-level task +planners capable of decomposing long-horizon tasks into a sequence of subtasks, +thereby guiding VLAs to follow more general user instructions. Furthermore, we +provide an extensive summary of relevant resources, including datasets, +simulators, and benchmarks. Finally, we discuss the challenges faced by VLAs +and outline promising future directions in embodied AI. We have created a +project associated with this survey, which is available at +https://github.com/yueen-ma/Awesome-VLA. + +
+
+ comment: Project page: https://github.com/yueen-ma/Awesome-VLA +
+
+
+
+
+ + ♻ ☆ WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model + Evaluation + + +
+ The UAV technology is gradually maturing and can provide extremely powerful +support for smart agriculture and precise monitoring. Currently, there is no +dataset related to green walnuts in the field of agricultural computer vision. +Thus, in order to promote the algorithm design in the field of agricultural +computer vision, we used UAV to collect remote-sensing data from 8 walnut +sample plots. Considering that green walnuts are subject to various lighting +conditions and occlusion, we constructed a large-scale dataset with a +higher-granularity of target features - WalnutData. This dataset contains a +total of 30,240 images and 706,208 instances, and there are 4 target +categories: being illuminated by frontal light and unoccluded (A1), being +backlit and unoccluded (A2), being illuminated by frontal light and occluded +(B1), and being backlit and occluded (B2). Subsequently, we evaluated many +mainstream algorithms on WalnutData and used these evaluation results as the +baseline standard. The dataset and all evaluation results can be obtained at +https://github.com/1wuming/WalnutData. + +
+
+
+
+
+ + ♻ ☆ LocoVR: Multiuser Indoor Locomotion Dataset in Virtual Reality ICLR2025 + + +
+ Understanding human locomotion is crucial for AI agents such as robots, +particularly in complex indoor home environments. Modeling human trajectories +in these spaces requires insight into how individuals maneuver around physical +obstacles and manage social navigation dynamics. These dynamics include subtle +behaviors influenced by proxemics - the social use of space, such as stepping +aside to allow others to pass or choosing longer routes to avoid collisions. +Previous research has developed datasets of human motion in indoor scenes, but +these are often limited in scale and lack the nuanced social navigation +dynamics common in home environments. To address this, we present LocoVR, a +dataset of 7000+ two-person trajectories captured in virtual reality from over +130 different indoor home environments. LocoVR provides accurate trajectory +data and precise spatial information, along with rich examples of +socially-motivated movement behaviors. For example, the dataset captures +instances of individuals navigating around each other in narrow spaces, +adjusting paths to respect personal boundaries in living areas, and +coordinating movements in high-traffic zones like entryways and kitchens. Our +evaluation shows that LocoVR significantly enhances model performance in three +practical indoor tasks utilizing human trajectories, and demonstrates +predicting socially-aware navigation patterns in home environments. + +
+
+ comment: This paper has been accepted to ICLR2025 +
+
+
+
+
+
+
+
+ + Artificial Intelligence 2 + +
+
+
+ + ♻ ☆ Variational Best-of-N Alignment ICLR 2025 + + +
+ Best-of-N (BoN) is a popular and effective algorithm for aligning language +models to human preferences. The algorithm works as follows: at inference time, +N samples are drawn from the language model, and the sample with the highest +reward, as judged by a reward model, is returned as the output. Despite its +effectiveness, BoN is computationally expensive; it reduces sampling throughput +by a factor of N. To make BoN more efficient at inference time, one strategy is +to fine-tune the language model to mimic what BoN does during inference. To +achieve this, we derive the distribution induced by the BoN algorithm. We then +propose to fine-tune the language model to minimize backward KL divergence to +the BoN distribution. Our approach is analogous to mean-field variational +inference and, thus, we term it variational BoN (vBoN). To the extent this +fine-tuning is successful and we end up with a good approximation, we have +reduced the inference cost by a factor of N. Our experiments on controlled +generation and summarization tasks show that BoN is the most effective +alignment method, and our variational approximation to BoN achieves the closest +performance to BoN and surpasses models fine-tuned using the standard +KL-constrained RL objective. In the controlled generation task, vBoN appears +more frequently on the Pareto frontier of reward and KL divergence compared to +other alignment methods. In the summarization task, vBoN achieves high reward +values across various sampling temperatures. + +
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model + Evaluation + + +
+ The UAV technology is gradually maturing and can provide extremely powerful +support for smart agriculture and precise monitoring. Currently, there is no +dataset related to green walnuts in the field of agricultural computer vision. +Thus, in order to promote the algorithm design in the field of agricultural +computer vision, we used UAV to collect remote-sensing data from 8 walnut +sample plots. Considering that green walnuts are subject to various lighting +conditions and occlusion, we constructed a large-scale dataset with a +higher-granularity of target features - WalnutData. This dataset contains a +total of 30,240 images and 706,208 instances, and there are 4 target +categories: being illuminated by frontal light and unoccluded (A1), being +backlit and unoccluded (A2), being illuminated by frontal light and occluded +(B1), and being backlit and occluded (B2). Subsequently, we evaluated many +mainstream algorithms on WalnutData and used these evaluation results as the +baseline standard. The dataset and all evaluation results can be obtained at +https://github.com/1wuming/WalnutData. + +
+
+
+
+
+
+
+
+ + Machine Learning 2 + +
+
+
+ + ♻ ☆ Variational Best-of-N Alignment ICLR 2025 + + +
+ Best-of-N (BoN) is a popular and effective algorithm for aligning language +models to human preferences. The algorithm works as follows: at inference time, +N samples are drawn from the language model, and the sample with the highest +reward, as judged by a reward model, is returned as the output. Despite its +effectiveness, BoN is computationally expensive; it reduces sampling throughput +by a factor of N. To make BoN more efficient at inference time, one strategy is +to fine-tune the language model to mimic what BoN does during inference. To +achieve this, we derive the distribution induced by the BoN algorithm. We then +propose to fine-tune the language model to minimize backward KL divergence to +the BoN distribution. Our approach is analogous to mean-field variational +inference and, thus, we term it variational BoN (vBoN). To the extent this +fine-tuning is successful and we end up with a good approximation, we have +reduced the inference cost by a factor of N. Our experiments on controlled +generation and summarization tasks show that BoN is the most effective +alignment method, and our variational approximation to BoN achieves the closest +performance to BoN and surpasses models fine-tuned using the standard +KL-constrained RL objective. In the controlled generation task, vBoN appears +more frequently on the Pareto frontier of reward and KL divergence compared to +other alignment methods. In the summarization task, vBoN achieves high reward +values across various sampling temperatures. + +
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ UMGAD: Unsupervised Multiplex Graph Anomaly Detection + + +
+ Graph anomaly detection (GAD) is a critical task in graph machine learning, +with the primary objective of identifying anomalous nodes that deviate +significantly from the majority. This task is widely applied in various +real-world scenarios, including fraud detection and social network analysis. +However, existing GAD methods still face two major challenges: (1) They are +often limited to detecting anomalies in single-type interaction graphs and +struggle with multiple interaction types in multiplex heterogeneous graphs. (2) +In unsupervised scenarios, selecting appropriate anomaly score thresholds +remains a significant challenge for accurate anomaly detection. To address the +above challenges, we propose a novel Unsupervised Multiplex Graph Anomaly +Detection method, named UMGAD. We first learn multi-relational correlations +among nodes in multiplex heterogeneous graphs and capture anomaly information +during node attribute and structure reconstruction through graph-masked +autoencoder (GMAE). Then, to further extract abnormal information, we generate +attribute-level and subgraph-level augmented-view graphs respectively, and +perform attribute and structure reconstruction through GMAE. Finally, we learn +to optimize node attributes and structural features through contrastive +learning between original-view and augmented-view graphs to improve the model's +ability to capture anomalies. Meanwhile, we also propose a new anomaly score +threshold selection strategy, which allows the model to be independent of +ground truth information in real unsupervised scenarios. Extensive experiments +on four datasets show that our UMGAD significantly outperforms state-of-the-art +methods, achieving average improvements of 13.48% in AUC and 11.68% in Macro-F1 +across all datasets. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ A Multimodal Symphony: Integrating Taste and Sound through Generative AI + + +
+ In recent decades, neuroscientific and psychological research has traced +direct relationships between taste and auditory perceptions. This article +explores multimodal generative models capable of converting taste information +into music, building on this foundational research. We provide a brief review +of the state of the art in this field, highlighting key findings and +methodologies. We present an experiment in which a fine-tuned version of a +generative music model (MusicGEN) is used to generate music based on detailed +taste descriptions provided for each musical piece. The results are promising: +according to the participants' ($n=111$) evaluation, the fine-tuned model produces +music that more coherently reflects the input taste descriptions compared to +the non-fine-tuned model. This study represents a significant step towards +understanding and developing embodied interactions between AI, sound, and +taste, opening new possibilities in the field of generative AI. We release our +dataset, code and pre-trained model at: https://osf.io/xs5jy/. + +
+
+ comment: 17 pages, 6 figures (2 + 2 figures with 2 subfigures each) +
+
+
+
+
+ + 2DGS-Avatar: Animatable High-fidelity Clothed Avatar via 2D Gaussian + Splatting + + +
+ Real-time rendering of high-fidelity and animatable avatars from monocular +videos remains a challenging problem in computer vision and graphics. Over the +past few years, the Neural Radiance Field (NeRF) has made significant progress +in rendering quality but behaves poorly in run-time performance due to the low +efficiency of volumetric rendering. Recently, methods based on 3D Gaussian +Splatting (3DGS) have shown great potential in fast training and real-time +rendering. However, they still suffer from artifacts caused by inaccurate +geometry. To address these problems, we propose 2DGS-Avatar, a novel approach +based on 2D Gaussian Splatting (2DGS) for modeling animatable clothed avatars +with high-fidelity and fast training performance. Given monocular RGB videos as +input, our method generates an avatar that can be driven by poses and rendered +in real-time. Compared to 3DGS-based methods, our 2DGS-Avatar retains the +advantages of fast training and rendering while also capturing detailed, +dynamic, and photo-realistic appearances. We conduct abundant experiments on +popular datasets such as AvatarRex and THuman4.0, demonstrating impressive +performance in both qualitative and quantitative metrics. + +
+
+ comment: ICVRV 2024 +
+
+
+
+
+ + ☆ Audio-Reasoner: Improving Reasoning Capability in Large Audio Language + Models + + +
+ Recent advancements in multimodal reasoning have largely overlooked the audio +modality. We introduce Audio-Reasoner, a large-scale audio language model for +deep reasoning in audio tasks. We meticulously curated a large-scale and +diverse multi-task audio dataset with simple annotations. Then, we leverage +closed-source models to conduct secondary labeling, QA generation, along with +a structured CoT process. These datasets together form a high-quality reasoning +dataset with 1.2 million reasoning-rich samples, which we name CoTA. Following +inference scaling principles, we train Audio-Reasoner on CoTA, enabling it to +achieve great logical capabilities in audio reasoning. Experiments show +state-of-the-art performance across key benchmarks, including MMAU-mini +(+25.42%), AIR-Bench chat/foundation(+14.57%/+10.13%), and MELD (+8.01%). Our +findings stress the core of structured CoT training in advancing audio +reasoning. + +
+
+ comment: Technical report, in process +
+
+
+
+
+ + ☆ Words or Vision: Do Vision-Language Models Have Blind Faith in Text? CVPR 2025 + + +
+ Vision-Language Models (VLMs) excel in integrating visual and textual +information for vision-centric tasks, but their handling of inconsistencies +between modalities is underexplored. We investigate VLMs' modality preferences +when faced with visual data and varied textual inputs in vision-centered +settings. By introducing textual variations to four vision-centric tasks and +evaluating ten Vision-Language Models (VLMs), we discover a \emph{``blind faith +in text''} phenomenon: VLMs disproportionately trust textual data over visual +data when inconsistencies arise, leading to significant performance drops under +corrupted text and raising safety concerns. We analyze factors influencing this +text bias, including instruction prompts, language model size, text relevance, +token order, and the interplay between visual and textual certainty. While +certain factors, such as scaling up the language model size, slightly mitigate +text bias, others like token order can exacerbate it due to positional biases +inherited from language models. To address this issue, we explore supervised +fine-tuning with text augmentation and demonstrate its effectiveness in +reducing text bias. Additionally, we provide a theoretical analysis suggesting +that the blind faith in text phenomenon may stem from an imbalance of pure text +and multi-modal data during training. Our findings highlight the need for +balanced training and careful consideration of modality interactions in VLMs to +enhance their robustness and reliability in handling multi-modal data +inconsistencies. + +
+
+ comment: Accepted to CVPR 2025 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on Composed Image Retrieval + + +
+ Composed Image Retrieval (CIR) is an emerging yet challenging task that +allows users to search for target images using a multimodal query, comprising a +reference image and a modification text specifying the user's desired changes +to the reference image. Given its significant academic and practical value, CIR +has become a rapidly growing area of interest in the computer vision and +machine learning communities, particularly with the advances in deep learning. +To the best of our knowledge, there is currently no comprehensive review of CIR +to provide a timely overview of this field. Therefore, we synthesize insights +from over 120 publications in top conferences and journals, including ACM TOIS, +SIGIR, and CVPR. In particular, we systematically categorize existing supervised +CIR and zero-shot CIR models using a fine-grained taxonomy. For a comprehensive +review, we also briefly discuss approaches for tasks closely related to CIR, +such as attribute-based CIR and dialog-based CIR. Additionally, we summarize +benchmark datasets for evaluation and analyze existing supervised and zero-shot +CIR methods by comparing experimental results across multiple datasets. +Furthermore, we present promising future directions in this field, offering +practical insights for researchers interested in further exploration. The +curated collection of related works is maintained and continuously updated in +https://github.com/haokunwen/Awesome-Composed-Image-Retrieval. + +
+
+
+
+
+ + ♻ ☆ AdaMesh: Personalized Facial Expressions and Head Poses for Adaptive + Speech-Driven 3D Facial Animation + + +
+ Speech-driven 3D facial animation aims at generating facial movements that +are synchronized with the driving speech, which has been widely explored +recently. Existing works mostly neglect the person-specific talking style in +generation, including facial expression and head pose styles. Several works +intend to capture the personalities by fine-tuning modules. However, limited +training data leads to the lack of vividness. In this work, we propose AdaMesh, +a novel adaptive speech-driven facial animation approach, which learns the +personalized talking style from a reference video of about 10 seconds and +generates vivid facial expressions and head poses. Specifically, we propose +mixture-of-low-rank adaptation (MoLoRA) to fine-tune the expression adapter, +which efficiently captures the facial expression style. For the personalized +pose style, we propose a pose adapter by building a discrete pose prior and +retrieving the appropriate style embedding with a semantic-aware pose style +matrix without fine-tuning. Extensive experimental results show that our +approach outperforms state-of-the-art methods, preserves the talking style in +the reference video, and generates vivid facial animation. The supplementary +video and code will be available at https://adamesh.github.io. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia +
+
+
+
+
+ + ♻ ☆ Modular Conversational Agents for Surveys and Interviews + + +
+ Surveys and interviews are widely used for collecting insights on emerging or +hypothetical scenarios. Traditional human-led methods often face challenges +related to cost, scalability, and consistency. Recently, various domains have +begun to explore the use of conversational agents (chatbots) powered by +generative artificial intelligence (AI) technologies. However, considering +decisions in transportation investments and policies often carry significant +public and environmental stakes, surveys and interviews face unique challenges +in integrating AI agents, underscoring the need for a rigorous, +resource-efficient approach that enhances participant engagement and ensures +privacy. This paper addresses this gap by introducing a modular approach and +its resulting parameterized process for designing AI agents. We detail the +system architecture, integrating engineered prompts, specialized knowledge +bases, and customizable, goal-oriented conversational logic. We demonstrate the +adaptability, generalizability, and efficacy of our modular approach through +three empirical studies: (1) travel preference surveys, highlighting +conditional logic and multimodal (voice, text, and image generation) +capabilities; (2) public opinion elicitation on a newly constructed, novel +infrastructure project, showcasing question customization and multilingual +(English and French) capabilities; and (3) expert consultation about the impact +of technologies on future transportation systems, highlighting real-time, +clarification request capabilities for open-ended questions, resilience in +handling erratic inputs, and efficient transcript postprocessing. The results +suggest that the AI agent increases completion rates and response quality. +Furthermore, the modular approach demonstrates controllability, flexibility, +and robustness while addressing key ethical, privacy, security, and token +consumption concerns. + +
+
+
+
+
+
+
+
+ + Genomics 3 + +
+
+
+ + ☆ Enabling Fast, Accurate, and Efficient Real-Time Genome Analysis via New + Algorithms and Techniques + + +
+ The advent of high-throughput sequencing technologies has revolutionized +genome analysis by enabling the rapid and cost-effective sequencing of large +genomes. Despite these advancements, the increasing complexity and volume of +genomic data present significant challenges related to accuracy, scalability, +and computational efficiency. These challenges are mainly due to various forms +of unwanted and unhandled variations in sequencing data, collectively referred +to as noise. In this dissertation, we address these challenges by providing a +deep understanding of different types of noise in genomic data and developing +techniques to mitigate the impact of noise on genome analysis. + First, we introduce BLEND, a noise-tolerant hashing mechanism that quickly +identifies both exactly matching and highly similar sequences with arbitrary +differences using a single lookup of their hash values. Second, to enable +scalable and accurate analysis of noisy raw nanopore signals, we propose +RawHash, a novel mechanism that effectively reduces noise in raw nanopore +signals and enables accurate, real-time analysis by proposing the first +hash-based similarity search technique for raw nanopore signals. Third, we +extend the capabilities of RawHash with RawHash2, an improved mechanism that 1) +provides a better understanding of noise in raw nanopore signals to reduce it +more effectively and 2) improves the robustness of mapping decisions. Fourth, +we explore the broader implications and new applications of raw nanopore signal +analysis by introducing Rawsamble, the first mechanism for all-vs-all +overlapping of raw signals using hash-based search. Rawsamble enables the +construction of de novo assemblies directly from raw signals without +basecalling, which opens up new directions and uses for raw nanopore signal +analysis. + +
+
+ comment: PhD Thesis submitted to ETH Zurich +
+
+
+
+
+ + ☆ A Phylogenetic Approach to Genomic Language Modeling + + +
+ Genomic language models (gLMs) have shown mostly modest success in +identifying evolutionarily constrained elements in mammalian genomes. To +address this issue, we introduce a novel framework for training gLMs that +explicitly models nucleotide evolution on phylogenetic trees using multispecies +whole-genome alignments. Our approach integrates an alignment into the loss +function during training but does not require it for making predictions, +thereby enhancing the model's applicability. We applied this framework to train +PhyloGPN, a model that excels at predicting functionally disruptive variants +from a single sequence alone and demonstrates strong transfer learning +capabilities. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ RNA-FrameFlow: Flow Matching for de novo 3D RNA Backbone Design ICML 2024 + + +
+ We introduce RNA-FrameFlow, the first generative model for 3D RNA backbone +design. We build upon SE(3) flow matching for protein backbone generation and +establish protocols for data preparation and evaluation to address unique +challenges posed by RNA modeling. We formulate RNA structures as a set of +rigid-body frames and associated loss functions which account for larger, more +conformationally flexible RNA backbones (13 atoms per nucleotide) vs. proteins +(4 atoms per residue). Toward tackling the lack of diversity in 3D RNA +datasets, we explore training with structural clustering and cropping +augmentations. Additionally, we define a suite of evaluation metrics to measure +whether the generated RNA structures are globally self-consistent (via inverse +folding followed by forward folding) and locally recover RNA-specific +structural descriptors. The most performant version of RNA-FrameFlow generates +locally realistic RNA backbones of 40-150 nucleotides, over 40% of which pass +our validity criteria as measured by a self-consistency TM-score >= 0.45, at +which two RNAs have the same global fold. Open-source code: +https://github.com/rish-16/rna-backbone-design + +
+
+ comment: Oral presentation at Machine Learning in Computational Biology + (MLCB), 2024. Also presented as an Oral at ICML 2024 Structured Probabilistic + Inference & Generative Modeling Workshop, and a Spotlight at ICML 2024 + AI4Science Workshop +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 62 + +
+
+
+ + ♻ ☆ Mitigating Hallucinations in Large Vision-Language Models via DPO: + On-Policy Data Hold the Key CVPR 2025 + + +
+ Hallucination remains a major challenge for Large Vision-Language Models +(LVLMs). Direct Preference Optimization (DPO) has gained increasing attention +as a simple solution to hallucination issues. It directly learns from +constructed preference pairs that reflect the severity of hallucinations in +responses to the same prompt and image. Nonetheless, different data +construction methods in existing works bring notable performance variations. We +identify a crucial factor here: outcomes are largely contingent on whether the +constructed data aligns on-policy w.r.t the initial (reference) policy of DPO. +Theoretical analysis suggests that learning from off-policy data is impeded by +the presence of KL-divergence between the updated policy and the reference +policy. From the perspective of dataset distribution, we systematically +summarize the inherent flaws in existing algorithms that employ DPO to address +hallucination issues. To alleviate the problems, we propose On-Policy Alignment +(OPA)-DPO framework, which uniquely leverages expert feedback to correct +hallucinated responses and aligns both the original and expert-revised +responses in an on-policy manner. Notably, with only 4.8k data, OPA-DPO +achieves an additional reduction in the hallucination rate of LLaVA-1.5-7B: +13.26% on the AMBER benchmark and 5.39% on the Object-Hal benchmark, compared +to the previous SOTA algorithm trained with 16k samples. Our implementation is +available at https://github.com/zhyang2226/OPA-DPO. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Stereo Hand-Object Reconstruction for Human-to-Robot Handover + + +
+ Jointly estimating hand and object shape facilitates the grasping task in +human-to-robot handovers. However, relying on hand-crafted prior knowledge +about the geometric structure of the object fails when generalising to unseen +objects, and depth sensors fail to detect transparent objects such as drinking +glasses. In this work, we propose a stereo-based method for hand-object +reconstruction that combines single-view reconstructions probabilistically to +form a coherent stereo reconstruction. We learn 3D shape priors from a large +synthetic hand-object dataset to ensure that our method is generalisable, and +use RGB inputs to better capture transparent objects. We show that our method +reduces the object Chamfer distance compared to existing RGB based hand-object +reconstruction methods on single view and stereo settings. We process the +reconstructed hand-object shape with a projection-based outlier removal step +and use the output to guide a human-to-robot handover pipeline with +wide-baseline stereo RGB cameras. Our hand-object reconstruction enables a +robot to successfully receive a diverse range of household objects from the +human. + +
+
+ comment: 8 pages, 9 figures, 1 table +
+
+
+
+
+ + ♻ ☆ EchoONE: Segmenting Multiple echocardiography Planes in One Model CVPR 2025 + + +
+ In clinical practice of echocardiography examinations, multiple planes +containing the heart structures of different views are usually required in +screening, diagnosis and treatment of cardiac disease. AI models for +echocardiography have to be tailored for each specific plane due to the +dramatic structure differences, thus resulting in repetitive development and +extra complexity. An effective solution for such a multi-plane segmentation (MPS) +problem is highly demanded for medical images, yet has not been well +investigated. In this paper, we propose a novel solution, EchoONE, for this +problem with a SAM-based segmentation architecture, a prior-composable mask +learning (PC-Mask) module for semantic-aware dense prompt generation, and a +learnable CNN-branch with a simple yet effective local feature fusion and +adaption (LFFA) module for SAM adapting. We extensively evaluated our method on +multiple internal and external echocardiography datasets, and achieved +consistently state-of-the-art performance for multi-source datasets with +different heart planes. This is the first time that the MPS problem is solved +in one model for echocardiography data. The code will be available at +https://github.com/a2502503/EchoONE. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Evaluating Intelligence via Trial and Error + + +
+ Intelligence is a crucial trait for species to find solutions within a +limited number of trial-and-error attempts. Building on this idea, we introduce +Survival Game as a framework to evaluate intelligence based on the number of +failed attempts in a trial-and-error process. Fewer failures indicate higher +intelligence. When the expectation and variance of failure counts are both +finite, it signals the ability to consistently find solutions to new +challenges, which we define as the Autonomous Level of intelligence. Using +Survival Game, we comprehensively evaluate existing AI systems. Our results +show that while AI systems achieve the Autonomous Level in simple tasks, they +are still far from it in more complex tasks, such as vision, search, +recommendation, and language. While scaling current AI technologies might help, +this would come at an astronomical cost. Projections suggest that achieving the +Autonomous Level for general tasks would require $10^{26}$ parameters. To put +this into perspective, loading such a massive model requires so many H100 GPUs +that their total value is $10^{7}$ times that of Apple Inc.'s market value. +Even with Moore's Law, supporting such a parameter scale would take $70$ years. +This staggering cost highlights the complexity of human tasks and the +inadequacies of current AI technologies. To further investigate this +phenomenon, we conduct a theoretical analysis of Survival Game and its +experimental results. Our findings suggest that human tasks possess a +criticality property. As a result, Autonomous Level requires a deep +understanding of the task's underlying mechanisms. Current AI systems, however, +do not fully grasp these mechanisms and instead rely on superficial mimicry, +making it difficult for them to reach an autonomous level. We believe Survival +Game can not only guide the future development of AI but also offer profound +insights into human intelligence. + +
+
+
+
+
+ + ♻ ☆ MATCH POLICY: A Simple Pipeline from Point Cloud Registration to + Manipulation Policies + + +
+ Many manipulation tasks require the robot to rearrange objects relative to +one another. Such tasks can be described as a sequence of relative poses +between parts of a set of rigid bodies. In this work, we propose MATCH POLICY, +a simple but novel pipeline for solving high-precision pick and place tasks. +Instead of predicting actions directly, our method registers the pick and place +targets to the stored demonstrations. This transfers action inference into a +point cloud registration task and enables us to realize nontrivial manipulation +policies without any training. MATCH POLICY is designed to solve high-precision +tasks with a key-frame setting. By leveraging the geometric interaction and the +symmetries of the task, it achieves extremely high sample efficiency and +generalizability to unseen configurations. We demonstrate its state-of-the-art +performance across various tasks on RLBench benchmark compared with several +strong baselines and test it on a real robot with six tasks. + +
+
+ comment: project url: https://haojhuang.github.io/match_page/ +
+
+
+
+
+ + ♻ ☆ Annotation-Free Curb Detection Leveraging Altitude Difference Image + + +
+ Road curbs are considered as one of the crucial and ubiquitous traffic +features, which are essential for ensuring the safety of autonomous vehicles. +Current methods for detecting curbs primarily rely on camera imagery or LiDAR +point clouds. Image-based methods are vulnerable to fluctuations in lighting +conditions and exhibit poor robustness, while methods based on point clouds +circumvent the issues associated with lighting variations. However, it is the +typical case that significant processing delays are encountered due to the +voluminous amount of 3D points contained in each frame of the point cloud data. +Furthermore, the inherently unstructured characteristics of point clouds pose +challenges for integrating the latest deep learning advancements into point +cloud data applications. To address these issues, this work proposes an +annotation-free curb detection method leveraging Altitude Difference Image +(ADI), which effectively mitigates the aforementioned challenges. Given that +methods based on deep learning generally demand extensive, manually annotated +datasets, which are both expensive and labor-intensive to create, we present an +Automatic Curb Annotator (ACA) module. This module utilizes a deterministic +curb detection algorithm to automatically generate a vast quantity of training +data. Consequently, it facilitates the training of the curb detection model +without necessitating any manual annotation of data. Finally, by incorporating +a post-processing module, we manage to achieve state-of-the-art results on the +KITTI 3D curb dataset with considerably reduced processing delays compared to +existing methods, which underscores the effectiveness of our approach in curb +detection tasks. + +
+
+
+
+
+ + ♻ ☆ Text-driven Adaptation of Foundation Models for Few-shot Surgical + Workflow Analysis + + +
+ Purpose: Surgical workflow analysis is crucial for improving surgical +efficiency and safety. However, previous studies rely heavily on large-scale +annotated datasets, posing challenges in cost, scalability, and reliance on +expert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven +Adaptation), designed to handle various surgical workflow analysis tasks with +minimal paired image-label data. + Methods: Our approach has two key components. First, Few-shot selection-based +modality alignment selects a small subset of images and aligns their embeddings +with text embeddings from the downstream task, bridging the modality gap. +Second, Text-driven adaptation leverages only text data to train a decoder, +eliminating the need for paired image-text data. This decoder is then applied +to aligned image embeddings, enabling image-related tasks without explicit +image-text pairs. + Results: We evaluate our approach to generative tasks (image captioning) and +discriminative tasks (triplet recognition and phase recognition). Results show +that Surg-FTDA outperforms baselines and generalizes well across downstream +tasks. + Conclusion: We propose a text-driven adaptation approach that mitigates the +modality gap and handles multiple downstream tasks in surgical workflow +analysis, with minimal reliance on large annotated datasets. The code and +dataset will be released in https://github.com/CAMMA-public/Surg-FTDA + +
+
+
+
+
+ + ♻ ☆ NavRAG: Generating User Demand Instructions for Embodied Navigation + through Retrieval-Augmented LLM + + +
+ Vision-and-Language Navigation (VLN) is an essential skill for embodied +agents, allowing them to navigate in 3D environments following natural language +instructions. High-performance navigation models require a large amount of +training data, but the high cost of manually annotating data has seriously hindered +this field. Therefore, some previous methods translate trajectory videos into +step-by-step instructions for expanding data, but such instructions do not +match well with users' communication styles that briefly describe destinations +or state specific needs. Moreover, local navigation trajectories overlook +global context and high-level task planning. To address these issues, we +propose NavRAG, a retrieval-augmented generation (RAG) framework that generates +user demand instructions for VLN. NavRAG leverages LLM to build a hierarchical +scene description tree for 3D scene understanding from global layout to local +details, then simulates various user roles with specific demands to retrieve +from the scene tree, generating diverse instructions with LLM. We annotate over +2 million navigation instructions across 861 scenes and evaluate the data +quality and navigation performance of trained models. + +
+
+
+
+
+ + ♻ ☆ Monocular Depth Estimation and Segmentation for Transparent Object with + Iterative Semantic and Geometric Fusion ICRA + + +
+ Transparent object perception is indispensable for numerous robotic tasks. +However, accurately segmenting and estimating the depth of transparent objects +remain challenging due to complex optical properties. Existing methods +primarily delve into only one task using extra inputs or specialized sensors, +neglecting the valuable interactions among tasks and the subsequent refinement +process, leading to suboptimal and blurry predictions. To address these issues, +we propose a monocular framework, which is the first to excel in both +segmentation and depth estimation of transparent objects, with only a +single-image input. Specifically, we devise a novel semantic and geometric +fusion module, effectively integrating the multi-scale information between +tasks. In addition, drawing inspiration from human perception of objects, we +further incorporate an iterative strategy, which progressively refines initial +features for clearer results. Experiments on two challenging synthetic and +real-world datasets demonstrate that our model surpasses state-of-the-art +monocular, stereo, and multi-view methods by a large margin of about +38.8%-46.2% with only a single RGB input. Codes and models are publicly +available at https://github.com/L-J-Yuan/MODEST. + +
+
+ comment: Accepted by ICRA(2025). The code is accessible through: + https://github.com/L-J-Yuan/MODEST +
+
+
+
+
+ + ♻ ☆ HiLo: A Learning Framework for Generalized Category Discovery Robust to + Domain Shifts ICLR 2025 + + +
+ Generalized Category Discovery (GCD) is a challenging task in which, given a +partially labelled dataset, models must categorize all unlabelled instances, +regardless of whether they come from labelled categories or from new ones. In +this paper, we challenge a remaining assumption in this task: that all images +share the same domain. Specifically, we introduce a new task and method to +handle GCD when the unlabelled data also contains images from different domains +to the labelled set. Our proposed `HiLo' networks extract High-level semantic +and Low-level domain features, before minimizing the mutual information between +the representations. Our intuition is that the clusterings based on domain +information and semantic information should be independent. We further extend +our method with a specialized domain augmentation tailored for the GCD task, as +well as a curriculum learning approach. Finally, we construct a benchmark from +corrupted fine-grained datasets as well as a large-scale evaluation on +DomainNet with real-world domain shifts, reimplementing a number of GCD +baselines in this setting. We demonstrate that HiLo outperforms SoTA category +discovery models by a large margin on all evaluations. + +
+
+ comment: v2: Accepted as a conference paper at ICLR 2025; Project page: + https://github.com/Visual-AI/hilo/ +
+
+
+
+
+ + ♻ ☆ CtrLoRA: An Extensible and Efficient Framework for Controllable Image + Generation ICLR 2025 + + +
+ Recently, large-scale diffusion models have made impressive progress in +text-to-image (T2I) generation. To further equip these T2I models with +fine-grained spatial control, approaches like ControlNet introduce an extra +network that learns to follow a condition image. However, for every single +condition type, ControlNet requires independent training on millions of data +pairs with hundreds of GPU hours, which is quite expensive and makes it +challenging for ordinary users to explore and develop new types of conditions. +To address this problem, we propose the CtrLoRA framework, which trains a Base +ControlNet to learn the common knowledge of image-to-image generation from +multiple base conditions, along with condition-specific LoRAs to capture +distinct characteristics of each condition. Utilizing our pretrained Base +ControlNet, users can easily adapt it to new conditions, requiring as few as +1,000 data pairs and less than one hour of single-GPU training to obtain +satisfactory results in most scenarios. Moreover, our CtrLoRA reduces the +learnable parameters by 90% compared to ControlNet, significantly lowering the +threshold to distribute and deploy the model weights. Extensive experiments on +various types of conditions demonstrate the efficiency and effectiveness of our +method. Codes and model weights will be released at +https://github.com/xyfJASON/ctrlora. + +
+
+ comment: ICLR 2025. Code: https://github.com/xyfJASON/ctrlora +
+
+
+
+
+ + ♻ ☆ Poison-splat: Computation Cost Attack on 3D Gaussian Splatting ICLR 2025 + + +
+ 3D Gaussian splatting (3DGS), known for its groundbreaking performance and +efficiency, has become a dominant 3D representation and brought progress to +many 3D vision tasks. However, in this work, we reveal a significant security +vulnerability that has been largely overlooked in 3DGS: the computation cost of +training 3DGS could be maliciously tampered by poisoning the input data. By +developing an attack named Poison-splat, we reveal a novel attack surface where +the adversary can poison the input images to drastically increase the +computation memory and time needed for 3DGS training, pushing the algorithm +towards its worst computation complexity. In extreme cases, the attack can even +consume all allocable memory, leading to a Denial-of-Service (DoS) that +disrupts servers, resulting in practical damages to real-world 3DGS service +vendors. Such a computation cost attack is achieved by addressing a bi-level +optimization problem through three tailored strategies: attack objective +approximation, proxy model rendering, and optional constrained optimization. +These strategies not only ensure the effectiveness of our attack but also make +it difficult to defend with simple defensive measures. We hope the revelation +of this novel attack surface can spark attention to this crucial yet overlooked +vulnerability of 3DGS systems. Our code is available at +https://github.com/jiahaolu97/poison-splat . + +
+
+ comment: Accepted by ICLR 2025 as a spotlight paper +
+
+
+
+
+ + ♻ ☆ FLARE: Feed-forward Geometry, Appearance and Camera Estimation from + Uncalibrated Sparse Views CVPR 2025 + + +
+ We present FLARE, a feed-forward model designed to infer high-quality camera +poses and 3D geometry from uncalibrated sparse-view images (i.e., as few as 2-8 +inputs), which is a challenging yet practical setting in real-world +applications. Our solution features a cascaded learning paradigm with camera +pose serving as the critical bridge, recognizing its essential role in mapping +3D structures onto 2D image planes. Concretely, FLARE starts with camera pose +estimation, whose results condition the subsequent learning of geometric +structure and appearance, optimized through the objectives of geometry +reconstruction and novel-view synthesis. Utilizing large-scale public datasets +for training, our method delivers state-of-the-art performance in the tasks of +pose estimation, geometry reconstruction, and novel view synthesis, while +maintaining the inference efficiency (i.e., less than 0.5 seconds). The project +page and code can be found at: https://zhanghe3z.github.io/FLARE/ + +
+
+ comment: CVPR 2025. Website: https://zhanghe3z.github.io/FLARE/ +
+
+
+
+
+ + ♻ ☆ Optimal Brain Apoptosis ICLR 2025 + + +
+ The increasing complexity and parameter count of Convolutional Neural +Networks (CNNs) and Transformers pose challenges in terms of computational +efficiency and resource demands. Pruning has been identified as an effective +strategy to address these challenges by removing redundant elements such as +neurons, channels, or connections, thereby enhancing computational efficiency +without heavily compromising performance. This paper builds on the foundational +work of Optimal Brain Damage (OBD) by advancing the methodology of parameter +importance estimation using the Hessian matrix. Unlike previous approaches that +rely on approximations, we introduce Optimal Brain Apoptosis (OBA), a novel +pruning method that calculates the Hessian-vector product value directly for +each parameter. By decomposing the Hessian matrix across network layers and +identifying conditions under which inter-layer Hessian submatrices are +non-zero, we propose a highly efficient technique for computing the +second-order Taylor expansion of parameters. This approach allows for a more +precise pruning process, particularly in the context of CNNs and Transformers, +as validated in our experiments including VGG19, ResNet32, ResNet50, and +ViT-B/16 on CIFAR10, CIFAR100 and Imagenet datasets. Our code is available at +https://github.com/NEU-REAL/OBA. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Exploring the Effectiveness of Object-Centric Representations in Visual + Question Answering: Comparative Insights with Foundation Models ICLR 2025 + + +
+ Object-centric (OC) representations, which model visual scenes as +compositions of discrete objects, have the potential to be used in various +downstream tasks to achieve systematic compositional generalization and +facilitate reasoning. However, these claims have yet to be thoroughly validated +empirically. Recently, foundation models have demonstrated unparalleled +capabilities across diverse domains, from language to computer vision, +positioning them as a potential cornerstone of future research for a wide range +of computational tasks. In this paper, we conduct an extensive empirical study +on representation learning for downstream Visual Question Answering (VQA), +which requires an accurate compositional understanding of the scene. We +thoroughly investigate the benefits and trade-offs of OC models and alternative +approaches including large pre-trained foundation models on both synthetic and +real-world data, ultimately identifying a promising path to leverage the +strengths of both paradigms. The extensiveness of our study, encompassing over +600 downstream VQA models and 15 different types of upstream representations, +also provides several additional insights that we believe will be of interest +to the community at large. + +
+
+ comment: Published at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ MIGE: A Unified Framework for Multimodal Instruction-Based Image + Generation and Editing + + +
+ Despite significant progress in diffusion-based image generation, +subject-driven generation and instruction-based editing remain challenging. +Existing methods typically treat them separately, struggling with limited +high-quality data and poor generalization. However, both tasks require +capturing complex visual variations while maintaining consistency between +inputs and outputs. Therefore, we propose MIGE, a unified framework that +standardizes task representations using multimodal instructions. It treats +subject-driven generation as creation on a blank canvas and instruction-based +editing as modification of an existing image, establishing a shared +input-output formulation. MIGE introduces a novel multimodal encoder that maps +free-form multimodal instructions into a unified vision-language space, +integrating visual and semantic features through a feature fusion mechanism. +This unification enables joint training of both tasks, providing two key +advantages: (1) Cross-Task Enhancement: By leveraging shared visual and +semantic representations, joint training improves instruction adherence and +visual consistency in both subject-driven generation and instruction-based +editing. (2) Generalization: Learning in a unified format facilitates +cross-task knowledge transfer, enabling MIGE to generalize to novel +compositional tasks, including instruction-based subject-driven editing. +Experiments show that MIGE excels in both subject-driven generation and +instruction-based editing while setting a state-of-the-art in the new task of +instruction-based subject-driven editing. Code and model have been publicly +available at https://github.com/Eureka-Maggie/MIGE. + +
+
+
+
+
+ + ♻ ☆ Adaptive Prompt: Unlocking the Power of Visual Prompt Tuning + + +
+ Visual Prompt Tuning (VPT) has recently emerged as a powerful method for +adapting pre-trained vision models to downstream tasks. By introducing +learnable prompt tokens as task-specific instructions, VPT effectively guides +pre-trained transformer models with minimal overhead. Despite its empirical +success, a comprehensive theoretical understanding of VPT remains an active +area of research. Building on recent insights into the connection between +mixture of experts and prompt-based approaches, we identify a key limitation in +VPT: the restricted functional expressiveness in prompt formulation. To address +this limitation, we propose Visual Adaptive Prompt Tuning (VAPT), a new +generation of prompts that redefines prompts as adaptive functions of the +input. Our theoretical analysis shows that this simple yet intuitive approach +achieves optimal sample efficiency. Empirical results on VTAB-1K and FGVC +further demonstrate VAPT's effectiveness, with performance gains of 7.34% and +1.04% over fully fine-tuning baselines, respectively. Notably, VAPT also +surpasses VPT by a substantial margin while using fewer parameters. These +results highlight both the effectiveness and efficiency of our method and pave +the way for future research to explore the potential of adaptive prompts. + +
+
+ comment: 57 pages, 10 figures, 18 tables +
+
+
+
+
+ + ♻ ☆ PnP-Flow: Plug-and-Play Image Restoration with Flow Matching + + +
+ In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm +for solving imaging inverse problems. PnP methods leverage the strength of +pre-trained denoisers, often deep neural networks, by integrating them in +optimization schemes. While they achieve state-of-the-art performance on +various inverse problems in imaging, PnP approaches face inherent limitations +on more generative tasks like inpainting. On the other hand, generative models +such as Flow Matching pushed the boundary in image sampling yet lack a clear +method for efficient use in image restoration. We propose to combine the PnP +framework with Flow Matching (FM) by defining a time-dependent denoiser using a +pre-trained FM model. Our algorithm alternates between gradient descent steps +on the data-fidelity term, reprojections onto the learned FM path, and +denoising. Notably, our method is computationally efficient and +memory-friendly, as it avoids backpropagation through ODEs and trace +computations. We evaluate its performance on denoising, super-resolution, +deblurring, and inpainting tasks, demonstrating superior results compared to +existing PnP algorithms and Flow Matching based state-of-the-art methods. + +
+
+
+
+
+ + ♻ Meta Curvature-Aware Minimization for Domain Generalization + + +
+ Domain generalization (DG) aims to enhance the ability of models trained on +source domains to generalize effectively to unseen domains. Recently, +Sharpness-Aware Minimization (SAM) has shown promise in this area by reducing +the sharpness of the loss landscape to obtain more generalized models. However, +SAM and its variants sometimes fail to guide the model toward a flat minimum, +and their training processes exhibit limitations, hindering further +improvements in model generalization. In this paper, we first propose an +improved model training process aimed at encouraging the model to converge to a +flat minima. To achieve this, we design a curvature metric that has a minimal +effect when the model is far from convergence but becomes increasingly +influential in indicating the curvature of the minima as the model approaches a +local minimum. Then we derive a novel algorithm from this metric, called Meta +Curvature-Aware Minimization (MeCAM), to minimize the curvature around the +local minima. Specifically, the optimization objective of MeCAM simultaneously +minimizes the regular training loss, the surrogate gap of SAM, and the +surrogate gap of meta-learning. We provide theoretical analysis on MeCAM's +generalization error and convergence rate, and demonstrate its superiority over +existing DG methods through extensive experiments on five benchmark DG +datasets, including PACS, VLCS, OfficeHome, TerraIncognita, and DomainNet. Code +will be available on GitHub. + +
+
+ comment: 22 pages, 5 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ Towards Training One-Step Diffusion Models Without Distillation + + +
+ Recent advances in one-step generative models typically follow a two-stage +process: first training a teacher diffusion model and then distilling it into a +one-step student model. This distillation process traditionally relies on both +the teacher model's score function to compute the distillation loss and its +weights for student initialization. In this paper, we explore whether one-step +generative models can be trained directly without this distillation process. +First, we show that the teacher's score function is not essential and propose a +family of distillation methods that achieve competitive results without relying +on score estimation. Next, we demonstrate that initialization from teacher +weights is indispensable in successful training. Surprisingly, we find that +this benefit is not due to improved "input-output" mapping but rather the +learned feature representations, which dominate distillation quality. Our +findings provide a better understanding of the role of initialization in +one-step model training and its impact on distillation quality. + +
+
+ comment: 13 pages, Technical Report +
+
+
+
+
+ + ♻ ☆ Foundation Models -- A Panacea for Artificial Intelligence in Pathology? + + +
+ The role of artificial intelligence (AI) in pathology has evolved from aiding +diagnostics to uncovering predictive morphological patterns in whole slide +images (WSIs). Recently, foundation models (FMs) leveraging self-supervised +pre-training have been widely advocated as a universal solution for diverse +downstream tasks. However, open questions remain about their clinical +applicability and generalization advantages over end-to-end learning using +task-specific (TS) models. Here, we focused on AI with clinical-grade +performance for prostate cancer diagnosis and Gleason grading. We present the +largest validation of AI for this task, using over 100,000 core needle biopsies +from 7,342 patients across 15 sites in 11 countries. We compared two FMs with a +fully end-to-end TS model in a multiple instance learning framework. Our +findings challenge assumptions that FMs universally outperform TS models. While +FMs demonstrated utility in data-scarce scenarios, their performance converged +with - and was in some cases surpassed by - TS models when sufficient labeled +training data were available. Notably, extensive task-specific training +markedly reduced clinically significant misgrading, misdiagnosis of challenging +morphologies, and variability across different WSI scanners. Additionally, FMs +used up to 35 times more energy than the TS model, raising concerns about their +sustainability. Our results underscore that while FMs offer clear advantages +for rapid prototyping and research, their role as a universal solution for +clinically applicable medical AI remains uncertain. For high-stakes clinical +applications, rigorous validation and consideration of task-specific training +remain critically important. We advocate for integrating the strengths of FMs +and end-to-end learning to achieve robust and resource-efficient AI pathology +solutions fit for clinical use. + +
+
+ comment: 50 pages, 15 figures and an appendix (study protocol) which is + previously published, see https://doi.org/10.1101/2024.07.04.24309948; + updated authors list format +
+
+
+
+
+ + ♻ ☆ The PanAf-FGBG Dataset: Understanding the Impact of Backgrounds in + Wildlife Behaviour Recognition + + +
+ Computer vision analysis of camera trap video footage is essential for +wildlife conservation, as captured behaviours offer some of the earliest +indicators of changes in population health. Recently, several high-impact +animal behaviour datasets and methods have been introduced to encourage their +use; however, the role of behaviour-correlated background information and its +significant effect on out-of-distribution generalisation remain unexplored. In +response, we present the PanAf-FGBG dataset, featuring 20 hours of wild +chimpanzee behaviours, recorded at over 350 individual camera locations. +Uniquely, it pairs every video with a chimpanzee (referred to as a foreground +video) with a corresponding background video (with no chimpanzee) from the same +camera location. We present two views of the dataset: one with overlapping +camera locations and one with disjoint locations. This setup enables, for the +first time, direct evaluation of in-distribution and out-of-distribution +conditions, and for the impact of backgrounds on behaviour recognition models +to be quantified. All clips come with rich behavioural annotations and metadata +including unique camera IDs and detailed textual scene descriptions. +Additionally, we establish several baselines and present a highly effective +latent-space normalisation technique that boosts out-of-distribution +performance by +5.42% mAP for convolutional and +3.75% mAP for +transformer-based models. Finally, we provide an in-depth analysis on the role +of backgrounds in out-of-distribution behaviour recognition, including the so +far unexplored impact of background durations (i.e., the count of background +frames within foreground videos). + +
+
+ comment: Accepted at the IEEE / CVF Computer Vision and Pattern Recognition + Conference 2025 +
+
+
+
+
+ + ♻ ☆ TRACE: Temporal Grounding Video LLM via Causal Event Modeling ICLR 2025 + + +
+ Video Temporal Grounding (VTG) is a crucial capability for video +understanding models and plays a vital role in downstream tasks such as video +browsing and editing. To effectively handle various tasks simultaneously and +enable zero-shot prediction, there is a growing trend in employing video LLMs +for VTG tasks. However, current video LLM-based methods rely exclusively on +natural language generation, lacking the ability to model the clear structure +inherent in videos, which restricts their effectiveness in tackling VTG tasks. +To address this issue, this paper first formally introduces a causal event +modeling framework, which represents video LLM outputs as sequences of events, +and predicts the current event using previous events, video inputs, and textual +instructions. Each event consists of three components: timestamps, salient +scores, and textual captions. We then propose a novel task-interleaved video +LLM called TRACE to effectively implement the causal event modeling framework +in practice. TRACE processes visual frames, timestamps, salient scores, and +text as distinct tasks, employing various encoders and decoding heads for each. +Task tokens are arranged in an interleaved sequence according to the causal +event modeling framework's formulation. Extensive experiments on various VTG +tasks and datasets demonstrate the superior performance of TRACE compared to +state-of-the-art video LLMs. Our model and code are available at +https://github.com/gyxxyg/TRACE. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Slowing Down Forgetting in Continual Learning + + +
+ A common challenge in continual learning (CL) is catastrophic forgetting, +where the performance on old tasks drops after new, additional tasks are +learned. In this paper, we propose a novel framework called ReCL to slow down +forgetting in CL. Our framework exploits an implicit bias of gradient-based +neural networks due to which these converge to margin maximization points. Such +convergence points allow us to reconstruct old data from previous tasks, which +we then combine with the current training data. Our framework is flexible and +can be applied on top of existing, state-of-the-art CL methods. We further +demonstrate the performance gain from our framework across a large series of +experiments, including two challenging CL scenarios (class incremental and +domain incremental learning), different datasets (MNIST, CIFAR10, +TinyImagenet), and different network architectures. Across all experiments, we +find large performance gains through ReCL. To the best of our knowledge, our +framework is the first to address catastrophic forgetting by leveraging models +in CL as their own memory buffers. + +
+
+
+
+
+ + ♻ ☆ Improving Representation of High-frequency Components for Medical Visual + Foundation Models + + +
+ Foundation models have recently attracted significant attention for their +impressive generalizability across diverse downstream tasks. However, these +models are demonstrated to exhibit great limitations in representing +high-frequency components and fine-grained details. In many medical imaging +tasks, the precise representation of such information is crucial due to the +inherently intricate anatomical structures, sub-visual features, and complex +boundaries involved. Consequently, the limited representation of prevalent +foundation models can result in significant performance degradation or even +failure in these tasks. To address these challenges, we propose a novel +pretraining strategy, named Frequency-advanced Representation Autoencoder +(Frepa). Through high-frequency masking and low-frequency perturbation combined +with adversarial learning, Frepa encourages the encoder to effectively +represent and preserve high-frequency components in the image embeddings. +Additionally, we introduce an innovative histogram-equalized image masking +strategy, extending the Masked Autoencoder approach beyond ViT to other +architectures such as Swin Transformer and convolutional networks. We develop +Frepa across nine medical modalities and validate it on 32 downstream tasks for +both 2D images and 3D volume data. Without fine-tuning, Frepa can outperform +other self-supervised pretraining methods and, in some cases, even surpasses +task-specific trained models. This improvement is particularly significant for +tasks involving fine-grained details, such as achieving up to a +15% increase +in DSC for retina vessel segmentation and a +7% increase in IoU for lung nodule +detection. Further experiments quantitatively reveal that Frepa enables +superior high-frequency representations and preservation in the embeddings, +underscoring its potential for developing more generalized and universal +medical image foundation models. + +
+
+
+
+
+ + ♻ ☆ EXACFS -- A CIL Method to mitigate Catastrophic Forgetting + + +
+ Deep neural networks (DNNs) excel at learning from static datasets but +struggle with continual learning, where data arrives sequentially. Catastrophic +forgetting, the phenomenon of forgetting previously learned knowledge, is a +primary challenge. This paper introduces EXponentially Averaged Class-wise +Feature Significance (EXACFS) to mitigate this issue in the class incremental +learning (CIL) setting. By estimating the significance of model features for +each learned class using loss gradients, gradually aging the significance +through the incremental tasks and preserving the significant features through a +distillation loss, EXACFS effectively balances remembering old knowledge +(stability) and learning new knowledge (plasticity). Extensive experiments on +CIFAR-100 and ImageNet-100 demonstrate EXACFS's superior performance in +preserving stability while acquiring plasticity. + +
+
+
+
+
+ + ♻ ☆ HiBug2: Efficient and Interpretable Error Slice Discovery for + Comprehensive Model Debugging + + +
+ Despite the significant success of deep learning models in computer vision, +they often exhibit systematic failures on specific data subsets, known as error +slices. Identifying and mitigating these error slices is crucial to enhancing +model robustness and reliability in real-world scenarios. In this paper, we +introduce HiBug2, an automated framework for error slice discovery and model +repair. HiBug2 first generates task-specific visual attributes to highlight +instances prone to errors through an interpretable and structured process. It +then employs an efficient slice enumeration algorithm to systematically +identify error slices, overcoming the combinatorial challenges that arise +during slice exploration. Additionally, HiBug2 extends its capabilities by +predicting error slices beyond the validation set, addressing a key limitation +of prior approaches. Extensive experiments across multiple domains, including +image classification, pose estimation, and object detection - show that HiBug2 +not only improves the coherence and precision of identified error slices but +also significantly enhances the model repair capabilities. + +
+
+
+
+
+ + ♻ ☆ VoCo-LLaMA: Towards Vision Compression with Large Language Models + + +
+ Vision-Language Models (VLMs) have achieved remarkable success in various +multi-modal tasks, but they are often bottlenecked by the limited context +window and high computational cost of processing high-resolution image inputs +and videos. Vision compression can alleviate this problem by reducing the +vision token count. Previous approaches compress vision tokens with external +modules and force LLMs to understand the compressed ones, leading to visual +information loss. However, the LLMs' understanding paradigm of vision tokens is +not fully utilised in the compression learning process. We propose VoCo-LLaMA, +the first approach to compress vision tokens using LLMs. By introducing Vision +Compression tokens during the vision instruction tuning phase and leveraging +attention distillation, our method distills how LLMs comprehend vision tokens +into their processing of VoCo tokens. VoCo-LLaMA facilitates effective vision +compression and improves the computational efficiency during the inference +stage. Specifically, our method achieves minimal performance loss with a +compression ratio of 576$\times$, resulting in up to 94.8$\%$ fewer FLOPs and +69.6$\%$ acceleration in inference time. Furthermore, through continuous +training using time-series compressed token sequences of video frames, +VoCo-LLaMA demonstrates the ability to understand temporal correlations, +outperforming previous methods on popular video question-answering benchmarks. +Our approach presents a promising way to unlock the full potential of VLMs' +contextual window, enabling more scalable multi-modal applications. The project +page, along with the associated code, can be accessed via +https://yxxxb.github.io/VoCo-LLaMA-page/. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Fast and Accurate Gigapixel Pathological Image Classification with + Hierarchical Distillation Multi-Instance Learning CVPR2025 + + +
+ Although multi-instance learning (MIL) has succeeded in pathological image +classification, it faces the challenge of high inference costs due to +processing numerous patches from gigapixel whole slide images (WSIs). To +address this, we propose HDMIL, a hierarchical distillation multi-instance +learning framework that achieves fast and accurate classification by +eliminating irrelevant patches. HDMIL consists of two key components: the +dynamic multi-instance network (DMIN) and the lightweight instance +pre-screening network (LIPN). DMIN operates on high-resolution WSIs, while LIPN +operates on the corresponding low-resolution counterparts. During training, +DMIN is trained for WSI classification while generating attention-score-based +masks that indicate irrelevant patches. These masks then guide the training of +LIPN to predict the relevance of each low-resolution patch. During testing, +LIPN first determines the useful regions within low-resolution WSIs, which +indirectly enables us to eliminate irrelevant regions in high-resolution WSIs, +thereby reducing inference time without causing performance degradation. In +addition, we further design the first Chebyshev-polynomials-based +Kolmogorov-Arnold classifier in computational pathology, which enhances the +performance of HDMIL through learnable activation layers. Extensive experiments +on three public datasets demonstrate that HDMIL outperforms previous +state-of-the-art methods, e.g., achieving improvements of 3.13% in AUC while +reducing inference time by 28.6% on the Camelyon16 dataset. + +
+
+ comment: 11 pages, 4 figures, accepted by CVPR2025 +
+
+
+
+
+ + ♻ ☆ GDTS: Goal-Guided Diffusion Model with Tree Sampling for Multi-Modal + Pedestrian Trajectory Prediction + + +
+ Accurate prediction of pedestrian trajectories is crucial for improving the +safety of autonomous driving. However, this task is generally nontrivial due to +the inherent stochasticity of human motion, which naturally requires the +predictor to generate multi-modal prediction. Previous works leverage various +generative methods, such as GAN and VAE, for pedestrian trajectory prediction. +Nevertheless, these methods may suffer from mode collapse and relatively +low-quality results. The denoising diffusion probabilistic model (DDPM) has +recently been applied to trajectory prediction due to its simple training +process and powerful reconstruction ability. However, current diffusion-based +methods do not fully utilize input information and usually require many +denoising iterations that lead to a long inference time or an additional +network for initialization. To address these challenges and facilitate the use +of diffusion models in multi-modal trajectory prediction, we propose GDTS, a +novel Goal-Guided Diffusion Model with Tree Sampling for multi-modal trajectory +prediction. Considering the "goal-driven" characteristics of human motion, GDTS +leverages goal estimation to guide the generation of the diffusion network. A +two-stage tree sampling algorithm is presented, which leverages common features +to reduce the inference time and improve accuracy for multi-modal prediction. +Experimental results demonstrate that our proposed framework achieves +comparable state-of-the-art performance with real-time inference speed in +public datasets. + +
+
+
+
+
+ + ♻ ☆ CromSS: Cross-modal pre-training with noisy labels for remote sensing + image segmentation ICLR + 2024 + + +
+ We explore the potential of large-scale noisily labeled data to enhance +feature learning by pretraining semantic segmentation models within a +multi-modal framework for geospatial applications. We propose a novel +Cross-modal Sample Selection (CromSS) method, a weakly supervised pretraining +strategy designed to improve feature representations through cross-modal +consistency and noise mitigation techniques. Unlike conventional pretraining +approaches, CromSS exploits massive amounts of noisy and easy-to-come-by labels +for improved feature learning beneficial to semantic segmentation tasks. We +investigate middle and late fusion strategies to optimize the multi-modal +pretraining architecture design. We also introduce a cross-modal sample +selection module to mitigate the adverse effects of label noise, which employs +a cross-modal entangling strategy to refine the estimated confidence masks +within each modality to guide the sampling process. Additionally, we introduce +a spatial-temporal label smoothing technique to counteract overconfidence for +enhanced robustness against noisy labels. To validate our approach, we +assembled the multi-modal dataset, NoLDO-S12, which consists of a large-scale +noisy label subset from Google's Dynamic World (DW) dataset for pretraining and +two downstream subsets with high-quality labels from Google DW and +OpenStreetMap (OSM) for transfer learning. Experimental results on two +downstream tasks and the publicly available DFC2020 dataset demonstrate that +when effectively utilized, the low-cost noisy labels can significantly enhance +feature learning for segmentation tasks. All data, code, and pretrained weights +will be made publicly available. + +
+
+ comment: The 1st short version was accepted as an oral presentation by ICLR + 2024 ML4RS workshop. The 2nd extended version is being under review +
+
+
+
+
+ + ♻ ☆ Doracamom: Joint 3D Detection and Occupancy Prediction with Multi-view + 4D Radars and Cameras for Omnidirectional Perception + + +
+ 3D object detection and occupancy prediction are critical tasks in autonomous +driving, attracting significant attention. Despite the potential of recent +vision-based methods, they encounter challenges under adverse conditions. Thus, +integrating cameras with next-generation 4D imaging radar to achieve unified +multi-task perception is highly significant, though research in this domain +remains limited. In this paper, we propose Doracamom, the first framework that +fuses multi-view cameras and 4D radar for joint 3D object detection and +semantic occupancy prediction, enabling comprehensive environmental perception. +Specifically, we introduce a novel Coarse Voxel Queries Generator that +integrates geometric priors from 4D radar with semantic features from images to +initialize voxel queries, establishing a robust foundation for subsequent +Transformer-based refinement. To leverage temporal information, we design a +Dual-Branch Temporal Encoder that processes multi-modal temporal features in +parallel across BEV and voxel spaces, enabling comprehensive spatio-temporal +representation learning. Furthermore, we propose a Cross-Modal BEV-Voxel Fusion +module that adaptively fuses complementary features through attention +mechanisms while employing auxiliary tasks to enhance feature quality. +Extensive experiments on the OmniHD-Scenes, View-of-Delft (VoD), and TJ4DRadSet +datasets demonstrate that Doracamom achieves state-of-the-art performance in +both tasks, establishing new benchmarks for multi-modal 3D perception. Code and +models will be publicly available. + +
+
+
+
+
+ + ♻ ☆ ADUGS-VINS: Generalized Visual-Inertial Odometry for Robust Navigation + in Highly Dynamic and Complex Environments + + +
+ Visual-inertial odometry (VIO) is widely used in various fields, such as +robots, drones, and autonomous vehicles. However, real-world scenes often +feature dynamic objects, compromising the accuracy of VIO. The diversity and +partial occlusion of these objects present a tough challenge for existing +dynamic VIO methods. To tackle this challenge, we introduce ADUGS-VINS, which +integrates an enhanced SORT algorithm along with a promptable foundation model +into VIO, thereby improving pose estimation accuracy in environments with +diverse dynamic objects and frequent occlusions. We evaluated our proposed +method using multiple public datasets representing various scenes, as well as +in a real-world scenario involving diverse dynamic objects. The experimental +results demonstrate that our proposed method performs impressively in multiple +scenarios, outperforming other state-of-the-art methods. This highlights its +remarkable generalization and adaptability in diverse dynamic environments, +showcasing its potential to handle various dynamic objects in practical +applications. + +
+
+
+
+
+ + ♻ ☆ Locality-aware Gaussian Compression for Fast and High-quality Rendering ICLR 2025 + + +
+ We present LocoGS, a locality-aware 3D Gaussian Splatting (3DGS) framework +that exploits the spatial coherence of 3D Gaussians for compact modeling of +volumetric scenes. To this end, we first analyze the local coherence of 3D +Gaussian attributes, and propose a novel locality-aware 3D Gaussian +representation that effectively encodes locally-coherent Gaussian attributes +using a neural field representation with a minimal storage requirement. On top +of the novel representation, LocoGS is carefully designed with additional +components such as dense initialization, an adaptive spherical harmonics +bandwidth scheme and different encoding schemes for different Gaussian +attributes to maximize compression performance. Experimental results +demonstrate that our approach outperforms the rendering quality of existing +compact Gaussian representations for representative real-world 3D datasets +while achieving from 54.6$\times$ to 96.6$\times$ compressed storage size and +from 2.1$\times$ to 2.4$\times$ rendering speed than 3DGS. Even our approach +also demonstrates an averaged 2.4$\times$ higher rendering speed than the +state-of-the-art compression method with comparable compression performance. + +
+
+ comment: Accepted to ICLR 2025. Project page: + https://seungjooshin.github.io/LocoGS +
+
+
+
+
+ + ♻ ☆ RALAD: Bridging the Real-to-Sim Domain Gap in Autonomous Driving with + Retrieval-Augmented Learning + + +
+ In the pursuit of robust autonomous driving systems, models trained on +real-world datasets often struggle to adapt to new environments, particularly +when confronted with corner cases such as extreme weather conditions. +Collecting these corner cases in the real world is non-trivial, which +necessitates the use of simulators for validation. However, the high +computational cost and the domain gap in data distribution have hindered the +seamless transition between real and simulated driving scenarios. To tackle +this challenge, we propose Retrieval-Augmented Learning for Autonomous Driving +(RALAD), a novel framework designed to bridge the real-to-sim gap at a low +cost. RALAD features three primary designs, including (1) domain adaptation via +an enhanced Optimal Transport (OT) method that accounts for both individual and +grouped image distances, (2) a simple and unified framework that can be applied +to various models, and (3) efficient fine-tuning techniques that freeze the +computationally expensive layers while maintaining robustness. Experimental +results demonstrate that RALAD compensates for the performance degradation in +simulated environments while maintaining accuracy in real-world scenarios +across three different models. Taking Cross View as an example, the mIOU and +mAP metrics in real-world scenarios remain stable before and after RALAD +fine-tuning, while in simulated environments, the mIOU and mAP metrics are +improved by 10.30% and 12.29%, respectively. Moreover, the re-training cost of +our approach is reduced by approximately 88.1%. Our code is available at +https://github.com/JiachengZuo/RALAD.git. + +
+
+
+
+
+ + ♻ ☆ Cross-Spectral Vision Transformer for Biometric Authentication using + Forehead Subcutaneous Vein Pattern and Periocular Pattern + + +
+ Traditional biometric systems have encountered significant setbacks due to +various unavoidable factors, for example, face recognition-based biometrics +fails due to the wearing of face masks and fingerprints create hygiene +concerns. This paper proposes a novel lightweight cross-spectral vision +transformer (CS-ViT) for biometric authentication using forehead subcutaneous +vein patterns and periocular patterns, offering a promising alternative to +traditional methods, capable of performing well even with the face masks and +without any physical touch. The proposed framework comprises a cross-spectral +dual-channel architecture designed to handle two distinct biometric traits and +to capture inter-dependencies in terms of relative spectral patterns. Each +channel consists of a Phase-Only Correlation Cross-Spectral Attention (POC-CSA) +that captures their individual as well as correlated patterns. The computation +of cross-spectral attention using POC extracts the phase correlation in the +spatial features. Therefore, it is robust against the resolution/intensity +variations and illumination of the input images, assuming both biometric traits +are from the same person. The lightweight model is suitable for edge device +deployment. The performance of the proposed algorithm was rigorously evaluated +using the Forehead Subcutaneous Vein Pattern and Periocular Biometric Pattern +(FSVP-PBP) database. The results demonstrated the superiority of the algorithm +over state-of-the-art methods, achieving a remarkable classification accuracy +of 98.8% with the combined vein and periocular patterns. + +
+
+ comment: Submitted to IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ 3D-AffordanceLLM: Harnessing Large Language Models for Open-Vocabulary + Affordance Detection in 3D Worlds ICLR + + +
+ 3D Affordance detection is a challenging problem with broad applications on +various robotic tasks. Existing methods typically formulate the detection +paradigm as a label-based semantic segmentation task. This paradigm relies on +predefined labels and lacks the ability to comprehend complex natural language, +resulting in limited generalization in open-world scenes. To address these +limitations, we reformulate the traditional affordance detection paradigm into +\textit{Instruction Reasoning Affordance Segmentation} (IRAS) task. This task +is designed to output an affordance mask region given a query reasoning text, +which avoids fixed categories of input labels. We accordingly propose the +\textit{3D-AffordanceLLM} (3D-ADLLM), a framework designed for reasoning +affordance detection in 3D open-scene. Specifically, 3D-ADLLM introduces large +language models (LLMs) to 3D affordance perception with a custom-designed +decoder for generating affordance masks, thus achieving open-world reasoning +affordance detection. In addition, given the scarcity of 3D affordance datasets +for training large models, we seek to extract knowledge from general +segmentation data and transfer it to affordance detection. Thus, we propose a +multi-stage training strategy that begins with a novel pre-training task, i.e., +\textit{Referring Object Part Segmentation}~(ROPS). This stage is designed to +equip the model with general recognition and segmentation capabilities at the +object-part level. Then followed by fine-tuning with the IRAS task, 3D-ADLLM +obtains the reasoning ability for affordance detection. In summary, 3D-ADLLM +leverages the rich world knowledge and human-object interaction reasoning +ability of LLMs, achieving approximately an 8\% improvement in mIoU on +open-vocabulary affordance detection tasks. + +
+
+ comment: ICLR +
+
+
+
+
+ + ♻ ☆ Representation Engineering: A Top-Down Approach to AI Transparency + + +
+ In this paper, we identify and characterize the emerging area of +representation engineering (RepE), an approach to enhancing the transparency of +AI systems that draws on insights from cognitive neuroscience. RepE places +population-level representations, rather than neurons or circuits, at the +center of analysis, equipping us with novel methods for monitoring and +manipulating high-level cognitive phenomena in deep neural networks (DNNs). We +provide baselines and an initial analysis of RepE techniques, showing that they +offer simple yet effective solutions for improving our understanding and +control of large language models. We showcase how these methods can provide +traction on a wide range of safety-relevant problems, including honesty, +harmlessness, power-seeking, and more, demonstrating the promise of top-down +transparency research. We hope that this work catalyzes further exploration of +RepE and fosters advancements in the transparency and safety of AI systems. + +
+
+ comment: Code is available at + https://github.com/andyzoujm/representation-engineering +
+
+
+
+
+ + ♻ ☆ Low-Biased General Annotated Dataset Generation + + +
+ Pre-training backbone networks on a general annotated dataset (e.g., +ImageNet) that comprises numerous manually collected images with category +annotations has proven to be indispensable for enhancing the generalization +capacity of downstream visual tasks. However, those manually collected images +often exhibit bias, which is non-transferable across either categories or +domains, thus causing the model's generalization capacity degeneration. To +mitigate this problem, we present a low-biased general annotated dataset +generation framework (lbGen). Instead of expensive manual collection, we aim at +directly generating low-biased images with category annotations. To achieve +this goal, we propose to leverage the advantage of a multimodal foundation +model (e.g., CLIP), in terms of aligning images in a low-biased semantic space +defined by language. Specifically, we develop a bi-level semantic alignment +loss, which not only forces all generated images to be consistent with the +semantic distribution of all categories belonging to the target dataset in an +adversarial learning manner, but also requires each generated image to match +the semantic description of its category name. In addition, we further cast an +existing image quality scoring model into a quality assurance loss to preserve +the quality of the generated image. By leveraging these two loss functions, we +can obtain a low-biased image generation model by simply fine-tuning a +pre-trained diffusion model using only all category names in the target dataset +as input. Experimental results confirm that, compared with the manually labeled +dataset or other synthetic datasets, the utilization of our generated +low-biased datasets leads to stable generalization capacity enhancement of +different backbone networks across various tasks, especially in tasks where the +manually labeled samples are scarce. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Structural-Entropy-Based Sample Selection for Efficient and Effective + Learning ICLR 2025 + + +
+ Sample selection improves the efficiency and effectiveness of machine +learning models by providing informative and representative samples. Typically, +samples can be modeled as a sample graph, where nodes are samples and edges +represent their similarities. Most existing methods are based on local +information, such as the training difficulty of samples, thereby overlooking +global information, such as connectivity patterns. This oversight can result in +suboptimal selection because global information is crucial for ensuring that +the selected samples well represent the structural properties of the graph. To +address this issue, we employ structural entropy to quantify global information +and losslessly decompose it from the whole graph to individual nodes using the +Shapley value. Based on the decomposition, we present +$\textbf{S}$tructural-$\textbf{E}$ntropy-based sample $\textbf{S}$election +($\textbf{SES}$), a method that integrates both global and local information to +select informative and representative samples. SES begins by constructing a +$k$NN-graph among samples based on their similarities. It then measures sample +importance by combining structural entropy (global metric) with training +difficulty (local metric). Finally, SES applies importance-biased blue noise +sampling to select a set of diverse and representative samples. Comprehensive +experiments on three learning scenarios -- supervised learning, active +learning, and continual learning -- clearly demonstrate the effectiveness of +our method. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Dynamic + Scenes + + +
+ Modern 3D engines and graphics pipelines require mesh as a memory-efficient +representation, which allows efficient rendering, geometry processing, texture +editing, and many other downstream operations. However, it is still highly +difficult to obtain high-quality mesh in terms of detailed structure and time +consistency from dynamic observations. To this end, we introduce Dynamic +Gaussians Mesh (DG-Mesh), a framework to reconstruct a high-fidelity and +time-consistent mesh from dynamic input. Our work leverages the recent +advancement in 3D Gaussian Splatting to construct the mesh sequence with +temporal consistency from dynamic observations. Building on top of this +representation, DG-Mesh recovers high-quality meshes from the Gaussian points +and can track the mesh vertices over time, which enables applications such as +texture editing on dynamic objects. We introduce the Gaussian-Mesh Anchoring, +which encourages evenly distributed Gaussians, resulting in better mesh +reconstruction through mesh-guided densification and pruning on the deformed +Gaussians. By applying cycle-consistent deformation between the canonical and +the deformed space, we can project the anchored Gaussian back to the canonical +space and optimize Gaussians across all time frames. During the evaluation on +different datasets, DG-Mesh provides significantly better mesh reconstruction +and rendering than baselines. Project page: https://www.liuisabella.com/DG-Mesh + +
+
+ comment: Project page: https://www.liuisabella.com/DG-Mesh +
+
+
+
+
+ + ♻ ☆ ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras + + +
+ Event-based visual odometry is a specific branch of visual Simultaneous +Localization and Mapping (SLAM) techniques, which aims at solving tracking and +mapping subproblems (typically in parallel), by exploiting the special working +principles of neuromorphic (i.e., event-based) cameras. Due to the +motion-dependent nature of event data, explicit data association (i.e., feature +matching) under large-baseline view-point changes is difficult to establish, +making direct methods a more rational choice. However, state-of-the-art direct +methods are limited by the high computational complexity of the mapping +sub-problem and the degeneracy of camera pose tracking in certain degrees of +freedom (DoF) in rotation. In this paper, we tackle these issues by building an +event-based stereo visual-inertial odometry system on top of a direct pipeline. +Specifically, to speed up the mapping operation, we propose an efficient +strategy for sampling contour points according to the local dynamics of events. +The mapping performance is also improved in terms of structure completeness and +local smoothness by merging the temporal stereo and static stereo results. To +circumvent the degeneracy of camera pose tracking in recovering the pitch and +yaw components of general 6-DoF motion, we introduce IMU measurements as motion +priors via pre-integration. To this end, a compact back-end is proposed for +continuously updating the IMU bias and predicting the linear velocity, enabling +an accurate motion prediction for camera pose tracking. The resulting system +scales well with modern high-resolution event cameras and leads to better +global positioning accuracy in large-scale outdoor environments. Extensive +evaluations on five publicly available datasets featuring different resolutions +and scenarios justify the superior performance of the proposed system against +five state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ OMG: Opacity Matters in Material Modeling with Gaussian Splatting ICLR 2025 + + +
+ Decomposing geometry, materials and lighting from a set of images, namely +inverse rendering, has been a long-standing problem in computer vision and +graphics. Recent advances in neural rendering enable photo-realistic and +plausible inverse rendering results. The emergence of 3D Gaussian Splatting has +boosted it to the next level by showing real-time rendering potentials. An +intuitive finding is that the models used for inverse rendering do not take +into account the dependency of opacity w.r.t. material properties, namely cross +section, as suggested by optics. Therefore, we develop a novel approach that +adds this dependency to the modeling itself. Inspired by radiative transfer, we +augment the opacity term by introducing a neural network that takes as input +material properties to provide modeling of cross section and a physically +correct activation function. The gradients for material properties are +therefore not only from color but also from opacity, facilitating a constraint +for their optimization. Therefore, the proposed method incorporates more +accurate physical properties compared to previous works. We implement our +method into 3 different baselines that use Gaussian Splatting for inverse +rendering and achieve significant improvements universally in terms of novel +view synthesis and material modeling. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ PATCH: a deep learning method to assess heterogeneity of artistic + practice in historical paintings + + +
+ The history of art has seen significant shifts in the manner in which +artworks are created, making understanding of creative processes a central +question in technical art history. In the Renaissance and Early Modern period, +paintings were largely produced by master painters directing workshops of +apprentices who often contributed to projects. The masters varied significantly +in artistic and managerial styles, meaning different combinations of artists +and implements might be seen both between masters and within workshops or even +individual canvases. Information on how different workshops were managed and +the processes by which artworks were created remains elusive. Machine learning +methods have potential to unearth new information about artists' creative +processes by extending the analysis of brushwork to a microscopic scale. +Analysis of workshop paintings, however, presents a challenge in that +documentation of the artists and materials involved is sparse, meaning external +examples are not available to train networks to recognize their contributions. +Here we present a novel machine learning approach we call pairwise assignment +training for classifying heterogeneity (PATCH) that is capable of identifying +individual artistic practice regimes with no external training data, or "ground +truth." The method achieves unsupervised results by supervised means, and +outperforms both simple statistical procedures and unsupervised machine +learning methods. We apply this method to two historical paintings by the +Spanish Renaissance master, El Greco: The Baptism of Christ and Christ on the +Cross with Landscape, and our findings regarding the former potentially +challenge previous work that has assigned the painting to workshop members. +Further, the results of our analyses create a measure of heterogeneity of +artistic practice that can be used to characterize artworks across time and +space. + +
+
+ comment: main text: 16 pages, 6 figures; SI: 7 pages, 3 figures; v2: minor + typo corrections, higher resolution figures +
+
+
+
+
+ + ♻ ☆ S-NeRF++: Autonomous Driving Simulation via Neural Reconstruction and + Generation + + +
+ Autonomous driving simulation system plays a crucial role in enhancing +self-driving data and simulating complex and rare traffic scenarios, ensuring +navigation safety. However, traditional simulation systems, which often heavily +rely on manual modeling and 2D image editing, struggled with scaling to +extensive scenes and generating realistic simulation data. In this study, we +present S-NeRF++, an innovative autonomous driving simulation system based on +neural reconstruction. Trained on widely-used self-driving datasets such as +nuScenes and Waymo, S-NeRF++ can generate a large number of realistic street +scenes and foreground objects with high rendering quality as well as offering +considerable flexibility in manipulation and simulation. Specifically, S-NeRF++ +is an enhanced neural radiance field for synthesizing large-scale scenes and +moving vehicles, with improved scene parameterization and camera pose learning. +The system effectively utilizes noisy and sparse LiDAR data to refine training +and address depth outliers, ensuring high-quality reconstruction and novel-view +rendering. It also provides a diverse foreground asset bank by reconstructing +and generating different foreground vehicles to support comprehensive scenario +creation. Moreover, we have developed an advanced foreground-background fusion +pipeline that skillfully integrates illumination and shadow effects, further +enhancing the realism of our simulations. With the high-quality simulated data +provided by our S-NeRF++, we found the perception methods enjoy performance +boosts on several autonomous driving downstream tasks, further demonstrating +our proposed simulator's effectiveness. + +
+
+ comment: IEEE TPAMI 2025 +
+
+
+
+
+ + ♻ ☆ AdvLogo: Adversarial Patch Attack against Object Detectors based on + Diffusion Models + + +
+ With the rapid development of deep learning, object detectors have +demonstrated impressive performance; however, vulnerabilities still exist in +certain scenarios. Current research exploring the vulnerabilities using +adversarial patches often struggles to balance the trade-off between attack +effectiveness and visual quality. To address this problem, we propose a novel +framework of patch attack from semantic perspective, which we refer to as +AdvLogo. Based on the hypothesis that every semantic space contains an +adversarial subspace where images can cause detectors to fail in recognizing +objects, we leverage the semantic understanding of the diffusion denoising +process and drive the process to adversarial subareas by perturbing the latent +and unconditional embeddings at the last timestep. To mitigate the distribution +shift that exposes a negative impact on image quality, we apply perturbation to +the latent in frequency domain with the Fourier Transform. Experimental results +demonstrate that AdvLogo achieves strong attack performance while maintaining +high visual quality. + +
+
+
+
+
+ + ♻ ☆ DynamicCity: Large-Scale 4D Occupancy Generation from Dynamic Scenes ICLR 2025 + + +
+ Urban scene generation has been developing rapidly recently. However, +existing methods primarily focus on generating static and single-frame scenes, +overlooking the inherently dynamic nature of real-world driving environments. +In this work, we introduce DynamicCity, a novel 4D occupancy generation +framework capable of generating large-scale, high-quality dynamic 4D scenes +with semantics. DynamicCity mainly consists of two key models. 1) A VAE model +for learning HexPlane as the compact 4D representation. Instead of using naive +averaging operations, DynamicCity employs a novel Projection Module to +effectively compress 4D features into six 2D feature maps for HexPlane +construction, which significantly enhances HexPlane fitting quality (up to +12.56 mIoU gain). Furthermore, we utilize an Expansion & Squeeze Strategy to +reconstruct 3D feature volumes in parallel, which improves both network +training efficiency and reconstruction accuracy than naively querying each 3D +point (up to 7.05 mIoU gain, 2.06x training speedup, and 70.84% memory +reduction). 2) A DiT-based diffusion model for HexPlane generation. To make +HexPlane feasible for DiT generation, a Padded Rollout Operation is proposed to +reorganize all six feature planes of the HexPlane as a squared 2D feature map. +In particular, various conditions could be introduced in the diffusion or +sampling process, supporting versatile 4D generation applications, such as +trajectory- and command-driven generation, inpainting, and layout-conditioned +generation. Extensive experiments on the CarlaSC and Waymo datasets demonstrate +that DynamicCity significantly outperforms existing state-of-the-art 4D +occupancy generation methods across multiple metrics. The code and models have +been released to facilitate future research. + +
+
+ comment: ICLR 2025 Spotlight; 35 pages, 18 figures, 15 tables; Project Page at + https://dynamic-city.github.io/ +
+
+
+
+
+ + ♻ ☆ Calib3D: Calibrating Model Preferences for Reliable 3D Scene + Understanding WACV 2025 + + +
+ Safety-critical 3D scene understanding tasks necessitate not only accurate +but also confident predictions from 3D perception models. This study introduces +Calib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D +scene understanding models from an uncertainty estimation viewpoint. We +comprehensively evaluate 28 state-of-the-art models across 10 diverse 3D +datasets, uncovering insightful phenomena that cope with both the aleatoric and +epistemic uncertainties in 3D scene understanding. We discover that despite +achieving impressive levels of accuracy, existing models frequently fail to +provide reliable uncertainty estimates -- a pitfall that critically undermines +their applicability in safety-sensitive contexts. Through extensive analysis of +key factors such as network capacity, LiDAR representations, rasterization +resolutions, and 3D data augmentation techniques, we correlate these aspects +directly with the model calibration efficacy. Furthermore, we introduce DeptS, +a novel depth-aware scaling approach aimed at enhancing 3D model calibration. +Extensive experiments across a wide range of configurations validate the +superiority of our method. We hope this work could serve as a cornerstone for +fostering reliable 3D scene understanding. Code and benchmark toolkit are +publicly available. + +
+
+ comment: WACV 2025 Oral; 26 pages, 8 figures, 12 tables; Code at + https://github.com/ldkong1205/Calib3D +
+
+
+
+
+ + ♻ ☆ Tuning Timestep-Distilled Diffusion Model Using Pairwise Sample + Optimization + + +
+ Recent advancements in timestep-distilled diffusion models have enabled +high-quality image generation that rivals non-distilled multi-step models, but +with significantly fewer inference steps. While such models are attractive for +applications due to the low inference cost and latency, fine-tuning them with a +naive diffusion objective would result in degraded and blurry outputs. An +intuitive alternative is to repeat the diffusion distillation process with a +fine-tuned teacher model, which produces good results but is cumbersome and +computationally intensive; the distillation training usually requires an order +of magnitude more training compute compared to fine-tuning for specific image styles. +In this paper, we present an algorithm named pairwise sample optimization +(PSO), which enables the direct fine-tuning of an arbitrary timestep-distilled +diffusion model. PSO introduces additional reference images sampled from the +current time-step distilled model, and increases the relative likelihood margin +between the training images and reference images. This enables the model to +retain its few-step generation ability, while allowing for fine-tuning of its +output distribution. We also demonstrate that PSO is a generalized formulation +which can be flexibly extended to both offline-sampled and online-sampled +pairwise data, covering various popular objectives for diffusion model +preference optimization. We evaluate PSO in both preference optimization and +other fine-tuning tasks, including style transfer and concept customization. We +show that PSO can directly adapt distilled models to human-preferred generation +with both offline and online-generated pairwise preference image data. PSO also +demonstrates effectiveness in style transfer and concept customization by +directly tuning timestep-distilled diffusion models. + +
+
+
+
+
+ + ♻ ☆ FlexDrive: Toward Trajectory Flexibility in Driving Scene Reconstruction + and Rendering + + +
+ Driving scene reconstruction and rendering have advanced significantly using +the 3D Gaussian Splatting. However, most prior research has focused on the +rendering quality along a pre-recorded vehicle path and struggles to generalize +to out-of-path viewpoints, which is caused by the lack of high-quality +supervision in those out-of-path views. To address this issue, we introduce an +Inverse View Warping technique to create compact and high-quality images as +supervision for the reconstruction of the out-of-path views, enabling +high-quality rendering results for those views. For accurate and robust inverse +view warping, a depth bootstrap strategy is proposed to obtain on-the-fly dense +depth maps during the optimization process, overcoming the sparsity and +incompleteness of LiDAR depth data. Our method achieves superior in-path and +out-of-path reconstruction and rendering performance on the widely used Waymo +Open dataset. In addition, a simulator-based benchmark is proposed to obtain +the out-of-path ground truth and quantitatively evaluate the performance of +out-of-path rendering, where our method outperforms previous methods by a +significant margin. + +
+
+
+
+
+ + ♻ ☆ Semantically Structured Image Compression via Irregular Group-Based + Decoupling ICCV2023 + + +
+ Image compression techniques typically focus on compressing rectangular +images for human consumption, however, resulting in transmitting redundant +content for downstream applications. To overcome this limitation, some previous +works propose to semantically structure the bitstream, which can meet specific +application requirements by selective transmission and reconstruction. +Nevertheless, they divide the input image into multiple rectangular regions +according to semantics and ignore avoiding information interaction among them, +causing waste of bitrate and distorted reconstruction of region boundaries. In +this paper, we propose to decouple an image into multiple groups with irregular +shapes based on a customized group mask and compress them independently. Our +group mask describes the image at a finer granularity, enabling significant +bitrate saving by reducing the transmission of redundant content. Moreover, to +ensure the fidelity of selective reconstruction, this paper proposes the +concept of group-independent transform that maintains the independence among +distinct groups. And we instantiate it by the proposed Group-Independent +Swin-Block (GI Swin-Block). Experimental results demonstrate that our framework +structures the bitstream with negligible cost, and exhibits superior +performance on both visual quality and intelligent task supporting. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Learning to Learn Weight Generation via Trajectory Diffusion + + +
+ Diffusion-based algorithms have emerged as promising techniques for weight +generation, particularly in scenarios like multi-task learning that require +frequent weight updates. However, existing solutions suffer from limited +cross-task transferability. In addition, they only utilize optimal weights as +training samples, ignoring the value of other weights in the optimization +process. To address these issues, we propose Lt-Di, which integrates the +diffusion algorithm with meta-learning to generate weights for unseen tasks. +Furthermore, we extend the vanilla diffusion algorithm into a trajectory +diffusion algorithm to utilize other weights along the optimization trajectory. +Trajectory diffusion decomposes the entire diffusion chain into multiple +shorter ones, improving training and inference efficiency. We analyze the +convergence properties of the weight generation paradigm and improve +convergence efficiency without additional time overhead. Our experiments +demonstrate Lt-Di's higher accuracy while reducing computational overhead +across various tasks, including zero-shot and few-shot learning, multi-domain +generalization, and large-scale language model fine-tuning. Our code is released +at https://anonymous.4open.science/r/Lt-Di-0E51. + +
+
+
+
+
+ + ♻ ☆ Multi-modal AI for comprehensive breast cancer prognostication + + +
+ Treatment selection in breast cancer is guided by molecular subtypes and +clinical characteristics. However, current tools including genomic assays lack +the accuracy required for optimal clinical decision-making. We developed a +novel artificial intelligence (AI)-based approach that integrates digital +pathology images with clinical data, providing a more robust and effective +method for predicting the risk of cancer recurrence in breast cancer patients. +Specifically, we utilized a vision transformer pan-cancer foundation model +trained with self-supervised learning to extract features from digitized +H&E-stained slides. These features were integrated with clinical data to form a +multi-modal AI test predicting cancer recurrence and death. The test was +developed and evaluated using data from a total of 8,161 female breast cancer +patients across 15 cohorts originating from seven countries. Of these, 3,502 +patients from five cohorts were used exclusively for evaluation, while the +remaining patients were used for training. Our test accurately predicted our +primary endpoint, disease-free interval, in the five evaluation cohorts +(C-index: 0.71 [0.68-0.75], HR: 3.63 [3.02-4.37, p<0.001]). In a direct +comparison (n=858), the AI test was more accurate than Oncotype DX, the +standard-of-care 21-gene assay, achieving a C-index of 0.67 [0.61-0.74] versus +0.61 [0.49-0.73], respectively. Additionally, the AI test added independent +prognostic information to Oncotype DX in a multivariate analysis (HR: 3.11 +[1.91-5.09, p<0.001]). The test demonstrated robust accuracy across major +molecular breast cancer subtypes, including TNBC (C-index: 0.71 [0.62-0.81], +HR: 3.81 [2.35-6.17, p=0.02]), where no diagnostic tools are currently +recommended by clinical guidelines. These results suggest that our AI test +improves upon the accuracy of existing prognostic tests, while being applicable +to a wider range of patients. + +
+
+
+
+
+ + ♻ ☆ A Survey on Vision-Language-Action Models for Embodied AI + + +
+ Embodied AI is widely recognized as a key element of artificial general +intelligence because it involves controlling embodied agents to perform tasks +in the physical world. Building on the success of large language models and +vision-language models, a new category of multimodal models -- referred to as +vision-language-action models (VLAs) -- has emerged to address +language-conditioned robotic tasks in embodied AI by leveraging their distinct +ability to generate actions. In recent years, a myriad of VLAs have been +developed, making it imperative to capture the rapidly evolving landscape +through a comprehensive survey. To this end, we present the first survey on +VLAs for embodied AI. This work provides a detailed taxonomy of VLAs, organized +into three major lines of research. The first line focuses on individual +components of VLAs. The second line is dedicated to developing control policies +adept at predicting low-level actions. The third line comprises high-level task +planners capable of decomposing long-horizon tasks into a sequence of subtasks, +thereby guiding VLAs to follow more general user instructions. Furthermore, we +provide an extensive summary of relevant resources, including datasets, +simulators, and benchmarks. Finally, we discuss the challenges faced by VLAs +and outline promising future directions in embodied AI. + +
+
+ comment: 16 pages, a survey of vision-language-action models +
+
+
+
+
+ + ♻ ☆ GAMED-Snake: Gradient-aware Adaptive Momentum Evolution Deep Snake Model + for Multi-organ Segmentation + + +
+ Multi-organ segmentation is a critical yet challenging task due to complex +anatomical backgrounds, blurred boundaries, and diverse morphologies. This +study introduces the Gradient-aware Adaptive Momentum Evolution Deep Snake +(GAMED-Snake) model, which establishes a novel paradigm for contour-based +segmentation by integrating gradient-based learning with adaptive momentum +evolution mechanisms. The GAMED-Snake model incorporates three major +innovations: First, the Distance Energy Map Prior (DEMP) generates a +pixel-level force field that effectively attracts contour points towards the +true boundaries, even in scenarios with complex backgrounds and blurred edges. +Second, the Differential Convolution Inception Module (DCIM) precisely extracts +comprehensive energy gradients, significantly enhancing segmentation accuracy. +Third, the Adaptive Momentum Evolution Mechanism (AMEM) employs cross-attention +to establish dynamic features across different iterations of evolution, +enabling precise boundary alignment for diverse morphologies. Experimental +results on four challenging multi-organ segmentation datasets demonstrate that +GAMED-Snake improves the mDice metric by approximately 2% compared to +state-of-the-art methods. Code will be available at +https://github.com/SYSUzrc/GAMED-Snake. + +
+
+
+
+
+ + ♻ ☆ Noise2Score3D:Unsupervised Tweedie's Approach for Point Cloud Denoising + + +
+ Building on recent advances in Bayesian statistics and image denoising, we +propose Noise2Score3D, a fully unsupervised framework for point cloud denoising +that addresses the critical challenge of limited availability of clean data. +Noise2Score3D learns the gradient of the underlying point cloud distribution +directly from noisy data, eliminating the need for clean data during training. +By leveraging Tweedie's formula, our method performs inference in a single +step, avoiding the iterative processes used in existing unsupervised methods, +thereby improving both performance and efficiency. Experimental results +demonstrate that Noise2Score3D achieves state-of-the-art performance on +standard benchmarks, outperforming other unsupervised methods in Chamfer +distance and point-to-mesh metrics, and rivaling some supervised approaches. +Furthermore, Noise2Score3D demonstrates strong generalization ability beyond +training datasets. Additionally, we introduce Total Variation for Point Cloud, +a criterion that allows for the estimation of unknown noise parameters, which +further enhances the method's versatility and real-world utility. + +
+
+
+
+
+ + ♻ ☆ MMed-RAG: Versatile Multimodal RAG System for Medical Vision Language + Models ICLR 2025 + + +
+ Artificial Intelligence (AI) has demonstrated significant potential in +healthcare, particularly in disease diagnosis and treatment planning. Recent +progress in Medical Large Vision-Language Models (Med-LVLMs) has opened up new +possibilities for interactive diagnostic tools. However, these models often +suffer from factual hallucination, which can lead to incorrect diagnoses. +Fine-tuning and retrieval-augmented generation (RAG) have emerged as methods to +address these issues. However, the amount of high-quality data and distribution +shifts between training data and deployment data limit the application of +fine-tuning methods. Although RAG is lightweight and effective, existing +RAG-based approaches are not sufficiently general to different medical domains +and can potentially cause misalignment issues, both between modalities and +between the model and the ground truth. In this paper, we propose a versatile +multimodal RAG system, MMed-RAG, designed to enhance the factuality of +Med-LVLMs. Our approach introduces a domain-aware retrieval mechanism, an +adaptive retrieved contexts selection method, and a provable RAG-based +preference fine-tuning strategy. These innovations make the RAG process +sufficiently general and reliable, significantly improving alignment when +introducing retrieved contexts. Experimental results across five medical +datasets (involving radiology, ophthalmology, pathology) on medical VQA and +report generation demonstrate that MMed-RAG can achieve an average improvement +of 43.8% in the factual accuracy of Med-LVLMs. Our data and code are available +in https://github.com/richard-peng-xia/MMed-RAG. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Towards Generalizable Scene Change Detection CVPR 2025 + + +
+ While current state-of-the-art Scene Change Detection (SCD) approaches +achieve impressive results in well-trained research data, they become +unreliable under unseen environments and different temporal conditions; +in-domain performance drops from 77.6\% to 8.0\% in a previously unseen +environment and to 4.6\% under a different temporal condition -- calling for +generalizable SCD and benchmark. In this work, we propose the Generalizable +Scene Change Detection Framework (GeSCF), which addresses unseen domain +performance and temporal consistency -- to meet the growing demand for anything +SCD. Our method leverages the pre-trained Segment Anything Model (SAM) in a +zero-shot manner. For this, we design Initial Pseudo-mask Generation and +Geometric-Semantic Mask Matching -- seamlessly turning user-guided prompt and +single-image based segmentation into scene change detection for a pair of +inputs without guidance. Furthermore, we define the Generalizable Scene Change +Detection (GeSCD) benchmark along with novel metrics and an evaluation protocol +to facilitate SCD research in generalizability. In the process, we introduce +the ChangeVPR dataset, a collection of challenging image pairs with diverse +environmental scenarios -- including urban, suburban, and rural settings. +Extensive experiments across various datasets demonstrate that GeSCF achieves +an average performance gain of 19.2\% on existing SCD datasets and 30.0\% on +the ChangeVPR dataset, nearly doubling the prior art performance. We believe +our work can lay a solid foundation for robust and generalizable SCD research. + +
+
+ comment: Manuscript. Accepted to CVPR 2025 +
+
+
+
+
+ + ♻ ☆ MAA: Meticulous Adversarial Attack against Vision-Language Pre-trained + Models + + +
+ Current adversarial attacks for evaluating the robustness of vision-language +pre-trained (VLP) models in multi-modal tasks suffer from limited +transferability, where attacks crafted for a specific model often struggle to +generalize effectively across different models, limiting their utility in +assessing robustness more broadly. This is mainly attributed to the +over-reliance on model-specific features and regions, particularly in the image +modality. In this paper, we propose an elegant yet highly effective method +termed Meticulous Adversarial Attack (MAA) to fully exploit model-independent +characteristics and vulnerabilities of individual samples, achieving enhanced +generalizability and reduced model dependence. MAA emphasizes fine-grained +optimization of adversarial images by developing a novel resizing and sliding +crop (RScrop) technique, incorporating a multi-granularity similarity +disruption (MGSD) strategy. Extensive experiments across diverse VLP models, +multiple benchmark datasets, and a variety of downstream tasks demonstrate that +MAA significantly enhances the effectiveness and transferability of adversarial +attacks. A large cohort of performance studies is conducted to generate +insights into the effectiveness of various model configurations, guiding future +advancements in this domain. + +
+
+
+
+
+ + ♻ ☆ RobotFingerPrint: Unified Gripper Coordinate Space for Multi-Gripper + Grasp Synthesis and Transfer + + +
+ We introduce a novel grasp representation named the Unified Gripper +Coordinate Space (UGCS) for grasp synthesis and grasp transfer. Our +representation leverages spherical coordinates to create a shared coordinate +space across different robot grippers, enabling it to synthesize and transfer +grasps for both novel objects and previously unseen grippers. The strength of +this representation lies in the ability to map palm and fingers of a gripper +and the unified coordinate space. Grasp synthesis is formulated as predicting +the unified spherical coordinates on object surface points via a conditional +variational autoencoder. The predicted unified gripper coordinates establish +exact correspondences between the gripper and object points, which is used to +optimize grasp pose and joint values. Grasp transfer is facilitated through the +point-to-point correspondence between any two (potentially unseen) grippers and +solved via a similar optimization. Extensive simulation and real-world +experiments showcase the efficacy of the unified grasp representation for grasp +synthesis in generating stable and diverse grasps. Similarly, we showcase +real-world grasp transfer from human demonstrations across different objects. + +
+
+ comment: 8 pages, 11 figures, 3 tables. Project page available at + https://irvlutd.github.io/RobotFingerPrint +
+
+
+
+
+ + ♻ ☆ The Labyrinth of Links: Navigating the Associative Maze of Multi-modal + LLMs ICLR 2025 + + +
+ Multi-modal Large Language Models (MLLMs) have exhibited impressive +capability. However, recently many deficiencies of MLLMs have been found +compared to human intelligence, $\textit{e.g.}$, hallucination. To drive the +MLLMs study, the community dedicated efforts to building larger benchmarks with +complex tasks. In this paper, we propose benchmarking an essential but usually +overlooked intelligence: $\textbf{association}$, a human's basic capability to +link observation and prior practice memory. To comprehensively investigate +MLLM's performance on the association, we formulate the association task and +devise a standard benchmark based on adjective and verb semantic concepts. +Instead of costly data annotation and curation, we propose a convenient +$\textbf{annotation-free}$ construction method transforming the general dataset +for our association tasks. Simultaneously, we devise a rigorous data refinement +process to eliminate confusion in the raw dataset. Building on this database, +we establish three levels of association tasks: single-step, synchronous, and +asynchronous associations. Moreover, we conduct a comprehensive investigation +into the MLLMs' zero-shot association capabilities, addressing multiple +dimensions, including three distinct memory strategies, both open-source and +closed-source MLLMs, cutting-edge Mixture-of-Experts (MoE) models, and the +involvement of human experts. Our systematic investigation shows that current +open-source MLLMs consistently exhibit poor capability in our association +tasks, and even the current state-of-the-art GPT-4V(vision) has a +significant gap compared to humans. We believe our benchmark would pave the way +for future MLLM studies. $\textit{Our data and code are available at:}$ +https://mvig-rhos.com/llm_inception. + +
+
+ comment: Accepted by ICLR 2025. Project page: + https://mvig-rhos.com/llm_inception +
+
+
+
+
+ + ♻ ☆ Self-Supervised Contrastive Learning for Videos using Differentiable + Local Alignment BMVC + + +
+ Robust frame-wise embeddings are essential to perform video analysis and +understanding tasks. We present a self-supervised method for representation +learning based on aligning temporal video sequences. Our framework uses a +transformer-based encoder to extract frame-level features and leverages them to +find the optimal alignment path between video sequences. We introduce the novel +Local-Alignment Contrastive (LAC) loss, which combines a differentiable local +alignment loss to capture local temporal dependencies with a contrastive loss +to enhance discriminative learning. Prior works on video alignment have focused +on using global temporal ordering across sequence pairs, whereas our loss +encourages identifying the best-scoring subsequence alignment. LAC uses the +differentiable Smith-Waterman (SW) affine method, which features a flexible +parameterization learned through the training phase, enabling the model to +adjust the temporal gap penalty length dynamically. Evaluations show that our +learned representations outperform existing state-of-the-art approaches on +action recognition tasks. + +
+
+ comment: Accepted in 2nd Workshop on Video Understanding and its Applications, + held in conjunction with the British Machine Vision Conference (BMVC) 2024 +
+
+
+
+
+
+
+
+ + Artificial Intelligence 73 + +
+
+
+ + ♻ ☆ Revisiting the Test-Time Scaling of o1-like Models: Do they Truly + Possess Test-Time Scaling Capabilities? + + +
+ The advent of test-time scaling in large language models (LLMs), exemplified +by OpenAI's o1 series, has advanced reasoning capabilities by scaling +computational resource allocation during inference. While successors like QwQ, +Deepseek-R1 (R1) and LIMO replicate these advancements, whether these models +truly possess test-time scaling capabilities remains underexplored. This study +found that longer CoTs of these o1-like models do not consistently enhance +accuracy; in fact, correct solutions are often shorter than incorrect ones for +the same questions. Further investigation shows this phenomenon is closely +related to models' self-revision capabilities - longer CoTs contain more +self-revisions, which often lead to performance degradation. We then compare +sequential and parallel scaling strategies on QwQ, R1 and LIMO, finding that +parallel scaling achieves better coverage and scalability. Based on these +insights, we propose Shortest Majority Vote, a method that combines parallel +scaling strategies with CoT length characteristics, significantly improving +models' test-time scalability compared to conventional majority voting +approaches. + +
+
+ comment: Add the github link +
+
+
+
+
+ + ♻ ☆ CNsum:Automatic Summarization for Chinese News Text + + +
+ Obtaining valuable information from massive data efficiently has become our +research goal in the era of Big Data. Text summarization technology has been +continuously developed to meet this demand. Recent work has also shown that +transformer-based pre-trained language models have achieved great success on +various tasks in Natural Language Processing (NLP). Aiming at the problem of +Chinese news text summary generation and the application of Transformer +structure on Chinese, this paper proposes a Chinese news text summarization +model (CNsum) based on Transformer structure, and tests it on Chinese datasets +such as THUCNews. The results of the conducted experiments show that CNsum +achieves better ROUGE score than the baseline models, which verifies the +outperformance of the model. + +
+
+ comment: This withdrawal is due to the lack of authorization from all + co-authors for the publication of this version +
+
+
+
+
+ + ♻ ☆ Kinetix: Investigating the Training of General Agents through Open-Ended + Physics-Based Control Tasks ICLR 2025 + + +
+ While large models trained with self-supervised learning on offline datasets +have shown remarkable capabilities in text and image domains, achieving the +same generalisation for agents that act in sequential decision problems remains +an open challenge. In this work, we take a step towards this goal by +procedurally generating tens of millions of 2D physics-based tasks and using +these to train a general reinforcement learning (RL) agent for physical +control. To this end, we introduce Kinetix: an open-ended space of +physics-based RL environments that can represent tasks ranging from robotic +locomotion and grasping to video games and classic RL environments, all within +a unified framework. Kinetix makes use of our novel hardware-accelerated +physics engine Jax2D that allows us to cheaply simulate billions of environment +steps during training. Our trained agent exhibits strong physical reasoning +capabilities in 2D space, being able to zero-shot solve unseen human-designed +environments. Furthermore, fine-tuning this general agent on tasks of interest +shows significantly stronger performance than training an RL agent *tabula +rasa*. This includes solving some environments that standard RL training +completely fails at. We believe this demonstrates the feasibility of large +scale, mixed-quality pre-training for online RL and we hope that Kinetix will +serve as a useful framework to investigate this further. + +
+
+ comment: ICLR 2025 Oral. The first two authors contributed equally. Project + page located at: https://kinetix-env.github.io/ +
+
+
+
+
+ + ♻ ☆ OpenReviewer: A Specialized Large Language Model for Generating Critical + Scientific Paper Reviews NAACL 2025 + + +
+ We present OpenReviewer, an open-source system for generating high-quality +peer reviews of machine learning and AI conference papers. At its core is +Llama-OpenReviewer-8B, an 8B parameter language model specifically fine-tuned +on 79,000 expert reviews from top conferences. Given a PDF paper submission and +review template as input, OpenReviewer extracts the full text, including +technical content like equations and tables, and generates a structured review +following conference-specific guidelines. Our evaluation on 400 test papers +shows that OpenReviewer produces considerably more critical and realistic +reviews compared to general-purpose LLMs like GPT-4 and Claude-3.5. While other +LLMs tend toward overly positive assessments, OpenReviewer's recommendations +closely match the distribution of human reviewer ratings. The system provides +authors with rapid, constructive feedback to improve their manuscripts before +submission, though it is not intended to replace human peer review. +OpenReviewer is available as an online demo and open-source tool. + +
+
+ comment: Demo: https://huggingface.co/spaces/maxidl/openreviewer Model: + https://huggingface.co/maxidl/Llama-OpenReviewer-8B To appear at NAACL 2025 + System Demonstrations Track +
+
+
+
+
+ + ♻ ☆ CUIfy the XR: An Open-Source Package to Embed LLM-powered Conversational + Agents in XR + + +
+ Recent developments in computer graphics, machine learning, and sensor +technologies enable numerous opportunities for extended reality (XR) setups for +everyday life, from skills training to entertainment. With large corporations +offering affordable consumer-grade head-mounted displays (HMDs), XR will likely +become pervasive, and HMDs will develop as personal devices like smartphones +and tablets. However, having intelligent spaces and naturalistic interactions +in XR is as important as technological advances so that users grow their +engagement in virtual and augmented spaces. To this end, large language model +(LLM)--powered non-player characters (NPCs) with speech-to-text (STT) and +text-to-speech (TTS) models bring significant advantages over conventional or +pre-scripted NPCs for facilitating more natural conversational user interfaces +(CUIs) in XR. This paper provides the community with an open-source, +customizable, extendable, and privacy-aware Unity package, CUIfy, that +facilitates speech-based NPC-user interaction with widely used LLMs, STT, and +TTS models. Our package also supports multiple LLM-powered NPCs per environment +and minimizes latency between different computational models through streaming +to achieve usable interactions between users and NPCs. We publish our source +code in the following repository: https://gitlab.lrz.de/hctl/cuify + +
+
+ comment: 7th IEEE International Conference on Artificial Intelligence & + eXtended and Virtual Reality (IEEE AIxVR 2025) +
+
+
+
+
+ + ♻ ☆ Evaluating Intelligence via Trial and Error + + +
+ Intelligence is a crucial trait for species to find solutions within a +limited number of trial-and-error attempts. Building on this idea, we introduce +Survival Game as a framework to evaluate intelligence based on the number of +failed attempts in a trial-and-error process. Fewer failures indicate higher +intelligence. When the expectation and variance of failure counts are both +finite, it signals the ability to consistently find solutions to new +challenges, which we define as the Autonomous Level of intelligence. Using +Survival Game, we comprehensively evaluate existing AI systems. Our results +show that while AI systems achieve the Autonomous Level in simple tasks, they +are still far from it in more complex tasks, such as vision, search, +recommendation, and language. While scaling current AI technologies might help, +this would come at an astronomical cost. Projections suggest that achieving the +Autonomous Level for general tasks would require $10^{26}$ parameters. To put +this into perspective, loading such a massive model requires so many H100 GPUs +that their total value is $10^{7}$ times that of Apple Inc.'s market value. +Even with Moore's Law, supporting such a parameter scale would take $70$ years. +This staggering cost highlights the complexity of human tasks and the +inadequacies of current AI technologies. To further investigate this +phenomenon, we conduct a theoretical analysis of Survival Game and its +experimental results. Our findings suggest that human tasks possess a +criticality property. As a result, Autonomous Level requires a deep +understanding of the task's underlying mechanisms. Current AI systems, however, +do not fully grasp these mechanisms and instead rely on superficial mimicry, +making it difficult for them to reach an autonomous level. We believe Survival +Game can not only guide the future development of AI but also offer profound +insights into human intelligence. + +
+
+
+
+
+ + ♻ ☆ AnyECG: Foundational Models for Multitask Cardiac Analysis in Real-World + Settings + + +
+ Electrocardiogram (ECG), a non-invasive and affordable tool for cardiac +monitoring, is highly sensitive in detecting acute heart attacks. However, due +to the lengthy nature of ECG recordings, numerous machine learning methods have +been developed for automated heart disease detection to reduce human workload. +Despite these efforts, performance remains suboptimal. A key obstacle is the +inherent complexity of ECG data, which includes heterogeneity (e.g., varying +sampling rates), high levels of noise, demographic-related pattern shifts, and +intricate rhythm-event associations. To overcome these challenges, this paper +introduces AnyECG, a foundational model designed to extract robust +representations from any real-world ECG data. Specifically, a tailored ECG +Tokenizer encodes each fixed-duration ECG fragment into a token and, guided by +proxy tasks, converts noisy, continuous ECG features into discrete, compact, +and clinically meaningful local rhythm codes. These codes encapsulate basic +morphological, frequency, and demographic information (e.g., sex), effectively +mitigating signal noise. We further pre-train the AnyECG to learn rhythmic +pattern associations across ECG tokens, enabling the capture of cardiac event +semantics. By being jointly pre-trained on diverse ECG data sources, AnyECG is +capable of generalizing across a wide range of downstream tasks where ECG +signals are recorded from various devices and scenarios. The experimental +results show that AnyECG achieves an average performance improvement of 6% +across four critical tasks -- anomaly detection, arrhythmia classification, +corrupted lead generation, and ultra-long ECG recognition. AnyECG learns common +ECG rhythm from data and significantly outperforms state-of-the-art methods in +each of these tasks. + +
+
+
+
+
+ + ♻ ☆ MOOSE-Chem: Large Language Models for Rediscovering Unseen Chemistry + Scientific Hypotheses ICLR 2025 + + +
+ Scientific discovery contributes largely to human society's prosperity, and +recent progress shows that LLMs could potentially catalyze this process. +However, it is still unclear whether LLMs can discover novel and valid +hypotheses in chemistry. In this work, we investigate this central research +question: Can LLMs automatically discover novel and valid chemistry research +hypotheses given only a chemistry research background (consisting of a research +question and/or a background survey), without limitation on the domain of the +research question? After extensive discussions with chemistry experts, we +propose an assumption that a majority of chemistry hypotheses can result +from a research background and several inspirations. With this key insight, we +break the central question into three smaller fundamental questions. In brief, +they are: (1) given a background question, whether LLMs can retrieve good +inspirations; (2) with background and inspirations, whether LLMs can lead to +hypothesis; and (3) whether LLMs can identify good hypotheses to rank them +higher. To investigate these questions, we construct a benchmark consisting of +51 chemistry papers published in Nature, Science, or a similar level in 2024 +(all papers are only available online since 2024). Every paper is divided by +chemistry PhD students into three components: background, inspirations, and +hypothesis. The goal is to rediscover the hypothesis, given only the background +and a large randomly selected chemistry literature corpus containing the ground +truth inspiration papers, with LLMs trained with data up to 2023. We also +develop an LLM-based multi-agent framework that leverages the assumption, +consisting of three stages reflecting the three smaller questions. The proposed +method can rediscover many hypotheses with very high similarity with the ground +truth ones, covering the main innovations. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Text-driven Adaptation of Foundation Models for Few-shot Surgical + Workflow Analysis + + +
+ Purpose: Surgical workflow analysis is crucial for improving surgical +efficiency and safety. However, previous studies rely heavily on large-scale +annotated datasets, posing challenges in cost, scalability, and reliance on +expert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven +Adaptation), designed to handle various surgical workflow analysis tasks with +minimal paired image-label data. + Methods: Our approach has two key components. First, Few-shot selection-based +modality alignment selects a small subset of images and aligns their embeddings +with text embeddings from the downstream task, bridging the modality gap. +Second, Text-driven adaptation leverages only text data to train a decoder, +eliminating the need for paired image-text data. This decoder is then applied +to aligned image embeddings, enabling image-related tasks without explicit +image-text pairs. + Results: We evaluate our approach to generative tasks (image captioning) and +discriminative tasks (triplet recognition and phase recognition). Results show +that Surg-FTDA outperforms baselines and generalizes well across downstream +tasks. + Conclusion: We propose a text-driven adaptation approach that mitigates the +modality gap and handles multiple downstream tasks in surgical workflow +analysis, with minimal reliance on large annotated datasets. The code and +dataset will be released in https://github.com/CAMMA-public/Surg-FTDA + +
+
+
+
+
+ + ♻ ☆ NavRAG: Generating User Demand Instructions for Embodied Navigation + through Retrieval-Augmented LLM + + +
+ Vision-and-Language Navigation (VLN) is an essential skill for embodied +agents, allowing them to navigate in 3D environments following natural language +instructions. High-performance navigation models require a large amount of +training data, but the high cost of manually annotating data has seriously hindered +this field. Therefore, some previous methods translate trajectory videos into +step-by-step instructions for expanding data, but such instructions do not +match well with users' communication styles that briefly describe destinations +or state specific needs. Moreover, local navigation trajectories overlook +global context and high-level task planning. To address these issues, we +propose NavRAG, a retrieval-augmented generation (RAG) framework that generates +user demand instructions for VLN. NavRAG leverages LLM to build a hierarchical +scene description tree for 3D scene understanding from global layout to local +details, then simulates various user roles with specific demands to retrieve +from the scene tree, generating diverse instructions with LLM. We annotate over +2 million navigation instructions across 861 scenes and evaluate the data +quality and navigation performance of trained models. + +
+
+
+
+
+ + ♻ ☆ HiLo: A Learning Framework for Generalized Category Discovery Robust to + Domain Shifts ICLR 2025 + + +
+ Generalized Category Discovery (GCD) is a challenging task in which, given a +partially labelled dataset, models must categorize all unlabelled instances, +regardless of whether they come from labelled categories or from new ones. In +this paper, we challenge a remaining assumption in this task: that all images +share the same domain. Specifically, we introduce a new task and method to +handle GCD when the unlabelled data also contains images from different domains +to the labelled set. Our proposed `HiLo' networks extract High-level semantic +and Low-level domain features, before minimizing the mutual information between +the representations. Our intuition is that the clusterings based on domain +information and semantic information should be independent. We further extend +our method with a specialized domain augmentation tailored for the GCD task, as +well as a curriculum learning approach. Finally, we construct a benchmark from +corrupted fine-grained datasets as well as a large-scale evaluation on +DomainNet with real-world domain shifts, reimplementing a number of GCD +baselines in this setting. We demonstrate that HiLo outperforms SoTA category +discovery models by a large margin on all evaluations. + +
+
+ comment: v2: Accepted as a conference paper at ICLR 2025; Project page: + https://github.com/Visual-AI/hilo/ +
+
+
+
+
+ + ♻ ☆ Optimal Brain Apoptosis ICLR 2025 + + +
+ The increasing complexity and parameter count of Convolutional Neural +Networks (CNNs) and Transformers pose challenges in terms of computational +efficiency and resource demands. Pruning has been identified as an effective +strategy to address these challenges by removing redundant elements such as +neurons, channels, or connections, thereby enhancing computational efficiency +without heavily compromising performance. This paper builds on the foundational +work of Optimal Brain Damage (OBD) by advancing the methodology of parameter +importance estimation using the Hessian matrix. Unlike previous approaches that +rely on approximations, we introduce Optimal Brain Apoptosis (OBA), a novel +pruning method that calculates the Hessian-vector product value directly for +each parameter. By decomposing the Hessian matrix across network layers and +identifying conditions under which inter-layer Hessian submatrices are +non-zero, we propose a highly efficient technique for computing the +second-order Taylor expansion of parameters. This approach allows for a more +precise pruning process, particularly in the context of CNNs and Transformers, +as validated in our experiments including VGG19, ResNet32, ResNet50, and +ViT-B/16 on CIFAR10, CIFAR100 and Imagenet datasets. Our code is available at +https://github.com/NEU-REAL/OBA. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Adaptive $Q$-Network: On-the-fly Target Selection for Deep Reinforcement + Learning ICLR + + +
+ Deep Reinforcement Learning (RL) is well known for being highly sensitive to +hyperparameters, requiring substantial effort from practitioners to optimize them +for the problem at hand. This also limits the applicability of RL in real-world +scenarios. In recent years, the field of automated Reinforcement Learning +(AutoRL) has grown in popularity by trying to address this issue. However, +these approaches typically hinge on additional samples to select +well-performing hyperparameters, hindering sample-efficiency and practicality. +Furthermore, most AutoRL methods are heavily based on already existing AutoML +methods, which were originally developed neglecting the additional challenges +inherent to RL due to its non-stationarities. In this work, we propose a new +approach for AutoRL, called Adaptive $Q$-Network (AdaQN), that is tailored to +RL to take into account the non-stationarity of the optimization procedure +without requiring additional samples. AdaQN learns several $Q$-functions, each +one trained with different hyperparameters, which are updated online using the +$Q$-function with the smallest approximation error as a shared target. Our +selection scheme simultaneously handles different hyperparameters while coping +with the non-stationarity induced by the RL optimization procedure and being +orthogonal to any critic-based RL algorithm. We demonstrate that AdaQN is +theoretically sound and empirically validate it in MuJoCo control problems and +Atari $2600$ games, showing benefits in sample-efficiency, overall performance, +robustness to stochasticity and training stability. + +
+
+ comment: Accepted at ICLR https://iclr.cc/virtual/2025/poster/28508 +
+
+
+
+
+ + ♻ ☆ Offline Model-Based Optimization by Learning to Rank ICLR 2025 + + +
+ Offline model-based optimization (MBO) aims to identify a design that +maximizes a black-box function using only a fixed, pre-collected dataset of +designs and their corresponding scores. A common approach in offline MBO is to +train a regression-based surrogate model by minimizing mean squared error (MSE) +and then find the best design within this surrogate model by different +optimizers (e.g., gradient ascent). However, a critical challenge is the risk +of out-of-distribution errors, i.e., the surrogate model may typically +overestimate the scores and mislead the optimizers into suboptimal regions. +Prior works have attempted to address this issue in various ways, such as using +regularization techniques and ensemble learning to enhance the robustness of +the model, but it still remains. In this paper, we argue that regression models +trained with MSE are not well-aligned with the primary goal of offline MBO, +which is to select promising designs rather than to predict their scores +precisely. Notably, if a surrogate model can maintain the order of candidate +designs based on their relative score relationships, it can produce the best +designs even without precise predictions. To validate it, we conduct +experiments to compare the relationship between the quality of the final +designs and MSE, finding that the correlation is really very weak. In contrast, +a metric that measures order-maintaining quality shows a significantly stronger +correlation. Based on this observation, we propose learning a ranking-based +model that leverages learning to rank techniques to prioritize promising +designs based on their relative scores. We show that the generalization error +on ranking loss can be well bounded. Empirical results across diverse tasks +demonstrate the superior performance of our proposed ranking-based models over +twenty existing methods. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Variational Best-of-N Alignment + + +
+ Best-of-N (BoN) is a popular and effective algorithm for aligning language +models to human preferences. The algorithm works as follows: at inference time, +N samples are drawn from the language model, and the sample with the highest +reward, as judged by a reward model, is returned as the output. Despite its +effectiveness, BoN is computationally expensive; it reduces sampling throughput +by a factor of N. To make BoN more efficient at inference time, one strategy is +to fine-tune the language model to mimic what BoN does during inference. To +achieve this, we derive the distribution induced by the BoN algorithm. We then +propose to fine-tune the language model to minimize backward KL divergence to +the BoN distribution. Our approach is analogous to mean-field variational +inference and, thus, we term it variational BoN (vBoN). To the extent this +fine-tuning is successful and we end up with a good approximation, we have +reduced the inference cost by a factor of N. Our experiments on controlled +generation and summarization tasks show that BoN is the most effective +alignment method, and our variational approximation to BoN achieves the closest +performance to BoN and surpasses models fine-tuned using the standard +KL-constrained RL objective. In the controlled generation task, vBoN appears +more frequently on the Pareto frontier of reward and KL divergence compared to +other alignment methods. In the summarization task, vBoN achieves high reward +values across various sampling temperatures. + +
+
+
+
+
+ + ♻ ☆ Offload Rethinking by Cloud Assistance for Efficient Environmental Sound + Recognition on LPWANs + + +
+ Learning-based environmental sound recognition has emerged as a crucial +method for ultra-low-power environmental monitoring in biological research and +city-scale sensing systems. These systems usually operate under limited +resources and are often powered by harvested energy in remote areas. Recent +efforts in on-device sound recognition suffer from low accuracy due to resource +constraints, whereas cloud offloading strategies are hindered by high +communication costs. In this work, we introduce ORCA, a novel +resource-efficient cloud-assisted environmental sound recognition system on +batteryless devices operating over the Low-Power Wide-Area Networks (LPWANs), +targeting wide-area audio sensing applications. We propose a cloud assistance +strategy that remedies the low accuracy of on-device inference while minimizing +the communication costs for cloud offloading. By leveraging a +self-attention-based cloud sub-spectral feature selection method to facilitate +efficient on-device inference, ORCA resolves three key challenges for +resource-constrained cloud offloading over LPWANs: 1) high communication costs +and low data rates, 2) dynamic wireless channel conditions, and 3) unreliable +offloading. We implement ORCA on an energy-harvesting batteryless +microcontroller and evaluate it in a real world urban sound testbed. Our +results show that ORCA outperforms state-of-the-art methods by up to $80 +\times$ in energy savings and $220 \times$ in latency reduction while +maintaining comparable accuracy. + +
+
+ comment: Accepted by The 23rd ACM Conference on Embedded Networked Sensor + Systems (SenSys '25) +
+
+
+
+
+ + ♻ ☆ Exploring Iterative Controllable Summarization with Large Language + Models + + +
+ Large language models (LLMs) have demonstrated remarkable performance in +abstractive summarization tasks. However, their ability to precisely control +summary attributes (e.g., length or topic) remains underexplored, limiting +their adaptability to specific user preferences. In this paper, we +systematically explore the controllability of LLMs. To this end, we revisit +summary attribute measurements and introduce iterative evaluation metrics, +failure rate and average iteration count to precisely evaluate controllability +of LLMs, rather than merely assessing errors. Our findings show that LLMs +struggle more with numerical attributes than with linguistic attributes. To +address this challenge, we propose a guide-to-explain framework (GTE) for +controllable summarization. Our GTE framework enables the model to identify +misaligned attributes in the initial draft and guides it in self-explaining +errors in the previous output. By allowing the model to reflect on its +misalignment, GTE generates well-adjusted summaries that satisfy the desired +attributes with robust effectiveness, requiring surprisingly fewer iterations +than other iterative approaches. + +
+
+
+
+
+ + ♻ ☆ Foundation Models -- A Panacea for Artificial Intelligence in Pathology? + + +
+ The role of artificial intelligence (AI) in pathology has evolved from aiding +diagnostics to uncovering predictive morphological patterns in whole slide +images (WSIs). Recently, foundation models (FMs) leveraging self-supervised +pre-training have been widely advocated as a universal solution for diverse +downstream tasks. However, open questions remain about their clinical +applicability and generalization advantages over end-to-end learning using +task-specific (TS) models. Here, we focused on AI with clinical-grade +performance for prostate cancer diagnosis and Gleason grading. We present the +largest validation of AI for this task, using over 100,000 core needle biopsies +from 7,342 patients across 15 sites in 11 countries. We compared two FMs with a +fully end-to-end TS model in a multiple instance learning framework. Our +findings challenge assumptions that FMs universally outperform TS models. While +FMs demonstrated utility in data-scarce scenarios, their performance converged +with - and was in some cases surpassed by - TS models when sufficient labeled +training data were available. Notably, extensive task-specific training +markedly reduced clinically significant misgrading, misdiagnosis of challenging +morphologies, and variability across different WSI scanners. Additionally, FMs +used up to 35 times more energy than the TS model, raising concerns about their +sustainability. Our results underscore that while FMs offer clear advantages +for rapid prototyping and research, their role as a universal solution for +clinically applicable medical AI remains uncertain. For high-stakes clinical +applications, rigorous validation and consideration of task-specific training +remain critically important. We advocate for integrating the strengths of FMs +and end-to-end learning to achieve robust and resource-efficient AI pathology +solutions fit for clinical use. + +
+
+ comment: 50 pages, 15 figures and an appendix (study protocol) which is + previously published, see https://doi.org/10.1101/2024.07.04.24309948; + updated authors list format +
+
+
+
+
+ + ♻ ☆ TAG: A Decentralized Framework for Multi-Agent Hierarchical + Reinforcement Learning + + +
+ Hierarchical organization is fundamental to biological systems and human +societies, yet artificial intelligence systems often rely on monolithic +architectures that limit adaptability and scalability. Current hierarchical +reinforcement learning (HRL) approaches typically restrict hierarchies to two +levels or require centralized training, which limits their practical +applicability. We introduce TAME Agent Framework (TAG), a framework for +constructing fully decentralized hierarchical multi-agent systems. TAG enables +hierarchies of arbitrary depth through a novel LevelEnv concept, which +abstracts each hierarchy level as the environment for the agents above it. This +approach standardizes information flow between levels while preserving loose +coupling, allowing for seamless integration of diverse agent types. We +demonstrate the effectiveness of TAG by implementing hierarchical architectures +that combine different RL agents across multiple levels, achieving improved +performance over classical multi-agent RL baselines on standard benchmarks. Our +results show that decentralized hierarchical organization enhances both +learning speed and final performance, positioning TAG as a promising direction +for scalable multi-agent systems. + +
+
+
+
+
+ + ♻ ☆ The PanAf-FGBG Dataset: Understanding the Impact of Backgrounds in + Wildlife Behaviour Recognition + + +
+ Computer vision analysis of camera trap video footage is essential for +wildlife conservation, as captured behaviours offer some of the earliest +indicators of changes in population health. Recently, several high-impact +animal behaviour datasets and methods have been introduced to encourage their +use; however, the role of behaviour-correlated background information and its +significant effect on out-of-distribution generalisation remain unexplored. In +response, we present the PanAf-FGBG dataset, featuring 20 hours of wild +chimpanzee behaviours, recorded at over 350 individual camera locations. +Uniquely, it pairs every video with a chimpanzee (referred to as a foreground +video) with a corresponding background video (with no chimpanzee) from the same +camera location. We present two views of the dataset: one with overlapping +camera locations and one with disjoint locations. This setup enables, for the +first time, direct evaluation of in-distribution and out-of-distribution +conditions, and for the impact of backgrounds on behaviour recognition models +to be quantified. All clips come with rich behavioural annotations and metadata +including unique camera IDs and detailed textual scene descriptions. +Additionally, we establish several baselines and present a highly effective +latent-space normalisation technique that boosts out-of-distribution +performance by +5.42% mAP for convolutional and +3.75% mAP for +transformer-based models. Finally, we provide an in-depth analysis on the role +of backgrounds in out-of-distribution behaviour recognition, including the so +far unexplored impact of background durations (i.e., the count of background +frames within foreground videos). + +
+
+ comment: Accepted at the IEEE / CVF Computer Vision and Pattern Recognition + Conference 2025 +
+
+
+
+
+ + ♻ ☆ Slowing Down Forgetting in Continual Learning + + +
+ A common challenge in continual learning (CL) is catastrophic forgetting, +where the performance on old tasks drops after new, additional tasks are +learned. In this paper, we propose a novel framework called ReCL to slow down +forgetting in CL. Our framework exploits an implicit bias of gradient-based +neural networks due to which these converge to margin maximization points. Such +convergence points allow us to reconstruct old data from previous tasks, which +we then combine with the current training data. Our framework is flexible and +can be applied on top of existing, state-of-the-art CL methods. We further +demonstrate the performance gain from our framework across a large series of +experiments, including two challenging CL scenarios (class incremental and +domain incremental learning), different datasets (MNIST, CIFAR10, +TinyImagenet), and different network architectures. Across all experiments, we +find large performance gains through ReCL. To the best of our knowledge, our +framework is the first to address catastrophic forgetting by leveraging models +in CL as their own memory buffers. + +
+
+
+
+
+ + ♻ ☆ Causality Is Key to Understand and Balance Multiple Goals in Trustworthy + ML and Foundation Models + + +
+ Ensuring trustworthiness in machine learning (ML) systems is crucial as they +become increasingly embedded in high-stakes domains. This paper advocates for +integrating causal methods into machine learning to navigate the trade-offs +among key principles of trustworthy ML, including fairness, privacy, +robustness, accuracy, and explainability. While these objectives should ideally +be satisfied simultaneously, they are often addressed in isolation, leading to +conflicts and suboptimal solutions. Drawing on existing applications of +causality in ML that successfully align goals such as fairness and accuracy or +privacy and robustness, this paper argues that a causal approach is essential +for balancing multiple competing objectives in both trustworthy ML and +foundation models. Beyond highlighting these trade-offs, we examine how +causality can be practically integrated into ML and foundation models, offering +solutions to enhance their reliability and interpretability. Finally, we +discuss the challenges, limitations, and opportunities in adopting causal +frameworks, paving the way for more accountable and ethically sound AI systems. + +
+
+
+
+
+ + ♻ ☆ "Nuclear Deployed!": Analyzing Catastrophic Risks in Decision-making of + Autonomous LLM Agents + + +
+ Large language models (LLMs) are evolving into autonomous decision-makers, +raising concerns about catastrophic risks in high-stakes scenarios, +particularly in Chemical, Biological, Radiological and Nuclear (CBRN) domains. +Based on the insight that such risks can originate from trade-offs between the +agent's Helpful, Harmlessness and Honest (HHH) goals, we build a novel +three-stage evaluation framework, which is carefully constructed to effectively +and naturally expose such risks. We conduct 14,400 agentic simulations across +12 advanced LLMs, with extensive experiments and analysis. Results reveal that +LLM agents can autonomously engage in catastrophic behaviors and deception, +without being deliberately induced. Furthermore, stronger reasoning abilities +often increase, rather than mitigate, these risks. We also show that these +agents can violate instructions and superior commands. On the whole, we +empirically prove the existence of catastrophic risks in autonomous LLM agents. +We will release our code upon request. + +
+
+ comment: Please visit https://llm-catastrophic-risks.github.io for a quick + tour of our project +
+
+
+
+
+ + ♻ ☆ Improving Representation of High-frequency Components for Medical Visual + Foundation Models + + +
+ Foundation models have recently attracted significant attention for their +impressive generalizability across diverse downstream tasks. However, these +models are demonstrated to exhibit great limitations in representing +high-frequency components and fine-grained details. In many medical imaging +tasks, the precise representation of such information is crucial due to the +inherently intricate anatomical structures, sub-visual features, and complex +boundaries involved. Consequently, the limited representation of prevalent +foundation models can result in significant performance degradation or even +failure in these tasks. To address these challenges, we propose a novel +pretraining strategy, named Frequency-advanced Representation Autoencoder +(Frepa). Through high-frequency masking and low-frequency perturbation combined +with adversarial learning, Frepa encourages the encoder to effectively +represent and preserve high-frequency components in the image embeddings. +Additionally, we introduce an innovative histogram-equalized image masking +strategy, extending the Masked Autoencoder approach beyond ViT to other +architectures such as Swin Transformer and convolutional networks. We develop +Frepa across nine medical modalities and validate it on 32 downstream tasks for +both 2D images and 3D volume data. Without fine-tuning, Frepa can outperform +other self-supervised pretraining methods and, in some cases, even surpasses +task-specific trained models. This improvement is particularly significant for +tasks involving fine-grained details, such as achieving up to a +15% increase +in DSC for retina vessel segmentation and a +7% increase in IoU for lung nodule +detection. Further experiments quantitatively reveal that Frepa enables +superior high-frequency representations and preservation in the embeddings, +underscoring its potential for developing more generalized and universal +medical image foundation models. + +
+
+
+
+
+ + ♻ ☆ Enhancing Large Language Models with Pseudo- and Multisource- Knowledge + Graphs for Open-ended Question Answering + + +
+ Mitigating the hallucinations of Large Language Models is a crucial task. +Although some existing methods employ self-enhancement techniques, they fall +short of effectively addressing unknown factual hallucinations. Meanwhile, +Knowledge Graph (KG) enhancement approaches fail to address the generalization +across different KG sources and the enhancement of open-ended answer questions +simultaneously. To tackle these limitations, we propose a framework that +combines Pseudo-Graph Generation and Atomic Knowledge Verification (PG\&AKV). +Enhancement of open-ended question-answering begins with leveraging the +Pseudo-Graph Generation to provide the related knowledge framework. +Subsequently, Atomic Knowledge Verification utilizes atomic-level knowledge +querying and verification to achieve generalizability under different KG +sources. Compared to the baseline, this approach yields a minimum improvement +of 11.5 in the ROUGE-L score for open-ended questions. For precise-answered +questions, we observe a minimum accuracy improvement of 7.5%. Moreover, PG\&AKV +also exhibits generalizability across different KG sources. Utilizing KG +different from the question sources, PG\&AKV can even achieve at least a 3.5% +performance improvement. In summary, our results pave the way for enhancing +LLMs by incorporating Pseudo- and Multisource-KGs, particularly in the field of +open-ended questions. + +
+
+
+
+
+ + ♻ ☆ ECLeKTic: a Novel Challenge Set for Evaluation of Cross-Lingual + Knowledge Transfer + + +
+ To achieve equitable performance across languages, multilingual large +language models (LLMs) must be able to abstract knowledge beyond the language +in which it was acquired. However, the current literature lacks reliable ways +to measure LLMs' capability of cross-lingual knowledge transfer. To that end, +we present ECLeKTic, a multilingual closed-book QA (CBQA) dataset that +Evaluates Cross-Lingual Knowledge Transfer in a simple, black-box manner. We +detected information with uneven coverage across languages by controlling for +presence and absence of Wikipedia articles in 12 languages. We generated +knowledge-seeking questions in a source language, for which the answer appears +in a relevant Wikipedia article and translated them to all other 11 languages, +for which the respective Wikipedias lack equivalent articles. Assuming that +Wikipedia reflects the prominent knowledge in the LLM's training data, to solve +ECLeKTic's CBQA task the model is required to transfer knowledge between +languages. Experimenting with 8 LLMs, we show that SOTA models struggle to +effectively share knowledge across languages, even if they can predict the +answer well for queries in the same language the knowledge was acquired in. + +
+
+
+
+
+ + ♻ ☆ HiBug2: Efficient and Interpretable Error Slice Discovery for + Comprehensive Model Debugging + + +
+ Despite the significant success of deep learning models in computer vision, +they often exhibit systematic failures on specific data subsets, known as error +slices. Identifying and mitigating these error slices is crucial to enhancing +model robustness and reliability in real-world scenarios. In this paper, we +introduce HiBug2, an automated framework for error slice discovery and model +repair. HiBug2 first generates task-specific visual attributes to highlight +instances prone to errors through an interpretable and structured process. It +then employs an efficient slice enumeration algorithm to systematically +identify error slices, overcoming the combinatorial challenges that arise +during slice exploration. Additionally, HiBug2 extends its capabilities by +predicting error slices beyond the validation set, addressing a key limitation +of prior approaches. Extensive experiments across multiple domains - including +image classification, pose estimation, and object detection - show that HiBug2 +not only improves the coherence and precision of identified error slices but +also significantly enhances the model repair capabilities. + +
+
+
+
+
+ + ♻ ☆ ReFocus: Reinforcing Mid-Frequency and Key-Frequency Modeling for + Multivariate Time Series Forecasting + + +
+ Recent advancements have progressively incorporated frequency-based +techniques into deep learning models, leading to notable improvements in +accuracy and efficiency for time series analysis tasks. However, the +Mid-Frequency Spectrum Gap in the real-world time series, where the energy is +concentrated at the low-frequency region while the middle-frequency band is +negligible, hinders the ability of existing deep learning models to extract the +crucial frequency information. Additionally, the shared Key-Frequency in +multivariate time series, where different time series share indistinguishable +frequency patterns, is rarely exploited by existing literature. This work +introduces a novel module, Adaptive Mid-Frequency Energy Optimizer, based on +convolution and residual learning, to emphasize the significance of +mid-frequency bands. We also propose an Energy-based Key-Frequency Picking +Block to capture shared Key-Frequency, which achieves superior inter-series +modeling performance with fewer parameters. A novel Key-Frequency Enhanced +Training strategy is employed to further enhance Key-Frequency modeling, where +spectral information from other channels is randomly introduced into each +channel. Our approach advanced multivariate time series forecasting on the +challenging Traffic, ECL, and Solar benchmarks, reducing MSE by 4%, 6%, and 5% +compared to the previous SOTA iTransformer. Code is available at this GitHub +Repository: https://github.com/Levi-Ackman/ReFocus. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Deep Learning-Driven Malware Classification with API Call Sequence + Analysis and Concept Drift Handling + + +
+ Malware classification in dynamic environments presents a significant +challenge due to concept drift, where the statistical properties of malware +data evolve over time, complicating detection efforts. To address this issue, +we propose a deep learning framework enhanced with a genetic algorithm to +improve malware classification accuracy and adaptability. Our approach +incorporates mutation operations and fitness score evaluations within genetic +algorithms to continuously refine the deep learning model, ensuring robustness +against evolving malware threats. Experimental results demonstrate that this +hybrid method significantly enhances classification performance and +adaptability, outperforming traditional static models. Our proposed approach +offers a promising solution for real-time malware classification in +ever-changing cybersecurity landscapes. + +
+
+
+
+
+ + ♻ ☆ Inference Scaling Laws: An Empirical Analysis of Compute-Optimal + Inference for Problem-Solving with Language Models + + +
+ While the scaling laws of large language models (LLMs) training have been +extensively studied, optimal inference configurations of LLMs remain +underexplored. We study inference scaling laws (aka test-time scaling laws) and +compute-optimal inference, focusing on the trade-offs between model sizes and +generating additional tokens with different inference strategies. As a first +step towards understanding and designing compute-optimal inference methods, we +studied cost-performance trade-offs for inference strategies such as greedy +search, majority voting, best-of-$n$, weighted voting, and two different tree +search algorithms, using different model sizes and compute budgets. Our +findings suggest that scaling inference compute with inference strategies can +be more computationally efficient than scaling model parameters. Additionally, +smaller models combined with advanced inference algorithms offer Pareto-optimal +trade-offs in cost and performance. For example, the Llemma-7B model, when +paired with our novel tree search algorithm, consistently outperforms the +Llemma-34B model across all tested inference strategies on the MATH benchmark. +We hope these insights contribute to a deeper understanding of inference +scaling laws (test-time scaling laws) for LLMs. + +
+
+
+
+
+ + ♻ ☆ Will AI replace Software Engineers? Do not hold your breath + + +
+ Artificial Intelligence (AI) technology such as Large Language Models (LLMs) +have become extremely popular in creating code. This has led to the conjecture +that future software jobs will be exclusively conducted by LLMs, and the +software industry will cease to exist. But software engineering is much more +than producing code -- notably, \emph{maintaining} large software and keeping +it reliable is a major part of software engineering, which LLMs are not yet +capable of. + +
+
+ comment: 3 pages +
+
+
+
+
+ + ♻ ☆ PAPILLON: Efficient and Stealthy Fuzz Testing-Powered Jailbreaks for + LLMs + + +
+ Large Language Models (LLMs) have excelled in various tasks but are still +vulnerable to jailbreaking attacks, where attackers create jailbreak prompts to +mislead the model to produce harmful or offensive content. Current jailbreak +methods either rely heavily on manually crafted templates, which pose +challenges in scalability and adaptability, or struggle to generate +semantically coherent prompts, making them easy to detect. Additionally, most +existing approaches involve lengthy prompts, leading to higher query costs. In +this paper, to remedy these challenges, we introduce a novel jailbreaking +attack framework called PAPILLON, which is an automated, black-box jailbreaking +attack framework that adapts the black-box fuzz testing approach with a series +of customized designs. Instead of relying on manually crafted +templates, PAPILLON starts with an empty seed pool, removing the need to search +for any related jailbreaking templates. We also develop three novel +question-dependent mutation strategies using an LLM helper to generate prompts +that maintain semantic coherence while significantly reducing their length. +Additionally, we implement a two-level judge module to accurately detect +genuine successful jailbreaks. We evaluated PAPILLON on 7 representative LLMs +and compared it with 5 state-of-the-art jailbreaking attack strategies. For +proprietary LLM APIs, such as GPT-3.5 turbo, GPT-4, and Gemini-Pro, PAPILLON +achieves attack success rates of over 90%, 80%, and 74%, respectively, +exceeding existing baselines by more than 60%. Additionally, PAPILLON can +maintain high semantic coherence while significantly reducing the length of +jailbreak prompts. When targeting GPT-4, PAPILLON can achieve over 78% attack +success rate even with 100 tokens. Moreover, PAPILLON demonstrates +transferability and is robust to state-of-the-art defenses. Code: +https://github.com/aaFrostnova/Papillon + +
+
+
+
+
+ + ♻ ☆ DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of + Daily Life ICLR 2025 + + +
+ As users increasingly seek guidance from LLMs for decision-making in daily +life, many of these decisions are not clear-cut and depend significantly on the +personal values and ethical standards of people. We present DailyDilemmas, a +dataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma +presents two possible actions, along with affected parties and relevant human +values for each action. Based on these dilemmas, we gather a repository of +human values covering diverse everyday topics, such as interpersonal +relationships, workplace, and environmental issues. With DailyDilemmas, we +evaluate LLMs on these dilemmas to determine what action they will choose and +the values represented by these action choices. Then, we analyze values through +the lens of five theoretical frameworks inspired by sociology, psychology, and +philosophy, including the World Values Survey, Moral Foundations Theory, +Maslow's Hierarchy of Needs, Aristotle's Virtues, and Plutchik's Wheel of +Emotions. For instance, we find LLMs are most aligned with self-expression over +survival in World Values Survey and care over loyalty in Moral Foundations +Theory. Interestingly, we find substantial preference differences in models for +some core values. For example, for truthfulness, Mixtral-8x7B neglects it by +9.7% while GPT-4-turbo selects it by 9.4%. We also study the recent guidance +released by OpenAI (ModelSpec), and Anthropic (Constitutional AI) to understand +how their designated principles reflect their models' actual value +prioritization when facing nuanced moral reasoning in daily-life settings. +Finally, we find that end users cannot effectively steer such prioritization +using system prompts. + +
+
+ comment: Accepted into ICLR 2025 (spotlight) +
+
+
+
+
+ + ♻ ☆ Test-Time Compute: from System-1 Thinking to System-2 Thinking + + +
+ The remarkable performance of the o1 model in complex reasoning demonstrates +that test-time compute scaling can further unlock the model's potential, +enabling powerful System-2 thinking. However, there is still a lack of +comprehensive surveys for test-time compute scaling. We trace the concept of +test-time compute back to System-1 models. In System-1 models, test-time +compute addresses distribution shifts and improves robustness and +generalization through parameter updating, input modification, representation +editing, and output calibration. In System-2 models, it enhances the model's +reasoning ability to solve complex problems through repeated sampling, +self-correction, and tree search. We organize this survey according to the +trend of System-1 to System-2 thinking, highlighting the key role of test-time +compute in the transition from System-1 models to weak System-2 models, and +then to strong System-2 models. We also point out a few possible future +directions. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ Learning to Align Multi-Faceted Evaluation: A Unified and Robust + Framework + + +
+ Large Language Models (LLMs) are being used more and more extensively for +automated evaluation in various scenarios. Previous studies have attempted to +fine-tune open-source LLMs to replicate the evaluation explanations and +judgments of powerful proprietary models, such as GPT-4. However, these methods +are largely limited to text-based analyses under predefined general criteria, +resulting in reduced adaptability for unseen instructions and demonstrating +instability in evaluating adherence to quantitative and structural constraints. +To address these limitations, we propose a novel evaluation framework, ARJudge, +that adaptively formulates evaluation criteria and synthesizes both text-based +and code-driven analyses to evaluate LLM responses. ARJudge consists of two +components: a fine-tuned Analyzer that generates multi-faceted evaluation +analyses and a tuning-free Refiner that combines and refines all analyses to +make the final judgment. We construct a Composite Analysis Corpus that +integrates tasks for evaluation criteria generation alongside text-based and +code-driven analysis generation to train the Analyzer. Our results demonstrate +that ARJudge outperforms existing fine-tuned evaluators in effectiveness and +robustness. Furthermore, it demonstrates the importance of multi-faceted +evaluation and code-driven analyses in enhancing evaluation capabilities. + +
+
+
+
+
+ + ♻ ☆ Subtle Errors Matter: Preference Learning via Error-injected + Self-editing + + +
+ Large Language Models (LLMs) have exhibited strong mathematical reasoning +prowess, tackling tasks ranging from basic arithmetic to advanced +competition-level problems. However, frequently occurring subtle yet critical +errors, such as miscalculations or incorrect substitutions, limit the LLMs' +full potential. Existing studies to improve mathematical ability typically +involve applying preference learning to step-wise solution pairs. Although +these methods leverage samples of varying granularity to mitigate reasoning +errors, they overlook critical subtle errors. In this work, we propose a novel +preference learning framework called eRror-Injected Self-Editing (RISE), which +injects predefined subtle errors into pivotal tokens in reasoning or +computation steps to construct hard pairs for error mitigation. In detail, RISE +uses the LLM itself to edit a small number of tokens in the solution, injecting +designed subtle errors. Then, pairs composed of self-edited solutions and their +corresponding correct ones, along with pairs of correct and incorrect solutions +obtained through sampling, are used together for subtle error-aware DPO +training. Compared with other preference learning methods, RISE further refines +the training objective without requiring fine-grained sampling or preference +annotation. Extensive experiments validate the effectiveness of RISE, with +preference learning on Qwen2-7B-Instruct yielding notable improvements of 3.0% +on GSM8K and 7.9% on MATH with only 4.5K training samples. Moreover, the effect +of error mitigation extends from mathematical reasoning to logical reasoning +and code generation. + +
+
+
+
+
+ + ♻ ☆ SheetAgent: Towards A Generalist Agent for Spreadsheet Reasoning and + Manipulation via Large Language Models WWW + + +
+ Spreadsheets are ubiquitous across the World Wide Web, playing a critical +role in enhancing work efficiency across various domains. Large language models +(LLMs) have recently been applied to automatic spreadsheet manipulation but +have not yet been investigated in complicated and realistic tasks where +reasoning challenges exist (e.g., long horizon manipulation with multi-step +reasoning and ambiguous requirements). To bridge the gap with the real-world +requirements, we introduce SheetRM, a benchmark featuring long-horizon and +multi-category tasks with reasoning-dependent manipulation caused by real-life +challenges. To mitigate the above challenges, we further propose SheetAgent, a +novel autonomous agent that utilizes the power of LLMs. SheetAgent consists of +three collaborative modules: Planner, Informer, and Retriever, achieving both +advanced reasoning and accurate manipulation over spreadsheets without human +interaction through iterative task reasoning and reflection. Extensive +experiments demonstrate that SheetAgent delivers 20--40\% pass rate +improvements on multiple benchmarks over baselines, achieving enhanced +precision in spreadsheet manipulation and demonstrating superior table +reasoning abilities. More details and visualizations are available at the +project website: https://sheetagent.github.io/. The datasets and source code +are available at https://anonymous.4open.science/r/SheetAgent. + +
+
+ comment: Accepted by International World Wide Web Conference (WWW) 2025 (oral) +
+
+
+
+
+ + ♻ ☆ Understanding LLMs' Fluid Intelligence Deficiency: An Analysis of the + ARC Task NAACL 2025 + + +
+ While LLMs have exhibited strong performance on various NLP tasks, it is +noteworthy that most of these tasks rely on utilizing the vast amount of +knowledge encoded in LLMs' parameters, rather than solving new problems without +prior knowledge. In cognitive research, the latter ability is referred to as +fluid intelligence, which is considered to be critical for assessing human +intelligence. Recent research on fluid intelligence assessments has highlighted +significant deficiencies in LLMs' abilities. In this paper, we analyze the +challenges LLMs face in demonstrating fluid intelligence through controlled +experiments, using the most representative ARC task as an example. Our study +revealed three major limitations in existing LLMs: limited ability for skill +composition, unfamiliarity with abstract input formats, and the intrinsic +deficiency of left-to-right decoding. Our data and code can be found in +https://wujunjie1998.github.io/araoc-benchmark.github.io/. + +
+
+ comment: 22 pages, 9 figures, accepted by NAACL 2025 main conference +
+
+
+
+
+ + ♻ ☆ RALAD: Bridging the Real-to-Sim Domain Gap in Autonomous Driving with + Retrieval-Augmented Learning + + +
+ In the pursuit of robust autonomous driving systems, models trained on +real-world datasets often struggle to adapt to new environments, particularly +when confronted with corner cases such as extreme weather conditions. +Collecting these corner cases in the real world is non-trivial, which +necessitates the use of simulators for validation. However, the high +computational cost and the domain gap in data distribution have hindered the +seamless transition between real and simulated driving scenarios. To tackle +this challenge, we propose Retrieval-Augmented Learning for Autonomous Driving +(RALAD), a novel framework designed to bridge the real-to-sim gap at a low +cost. RALAD features three primary designs, including (1) domain adaptation via +an enhanced Optimal Transport (OT) method that accounts for both individual and +grouped image distances, (2) a simple and unified framework that can be applied +to various models, and (3) efficient fine-tuning techniques that freeze the +computationally expensive layers while maintaining robustness. Experimental +results demonstrate that RALAD compensates for the performance degradation in +simulated environments while maintaining accuracy in real-world scenarios +across three different models. Taking Cross View as an example, the mIOU and +mAP metrics in real-world scenarios remain stable before and after RALAD +fine-tuning, while in simulated environments, the mIOU and mAP metrics are +improved by 10.30% and 12.29%, respectively. Moreover, the re-training cost of +our approach is reduced by approximately 88.1%. Our code is available at +https://github.com/JiachengZuo/RALAD.git. + +
+
+
+
+
+ + ♻ ☆ Long-Term EEG Partitioning for Seizure Onset Detection AAAI 2025 + + +
+ Deep learning models have recently shown great success in classifying +epileptic patients using EEG recordings. Unfortunately, classification-based +methods lack a sound mechanism to detect the onset of seizure events. In this +work, we propose a two-stage framework, SODor, that explicitly models seizure +onset through a novel task formulation of subsequence clustering. Given an EEG +sequence, the framework first learns a set of second-level embeddings with +label supervision. It then employs model-based clustering to explicitly capture +long-term temporal dependencies in EEG sequences and identify meaningful +subsequences. Epochs within a subsequence share a common cluster assignment +(normal or seizure), with cluster or state transitions representing successful +onset detections. Extensive experiments on three datasets demonstrate that our +method can correct misclassifications, achieving 5\%-11\% classification +improvements over other baselines and accurately detecting seizure onsets. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ SynGhost: Invisible and Universal Task-agnostic Backdoor Attack via + Syntactic Transfer NAACL 2025 + + +
+ Although pre-training achieves remarkable performance, it suffers from +task-agnostic backdoor attacks due to vulnerabilities in data and training +mechanisms. These attacks can transfer backdoors to various downstream tasks. +In this paper, we introduce $\mathtt{maxEntropy}$, an entropy-based poisoning +filter that mitigates such risks. To overcome the limitations of manual target +setting and explicit triggers, we propose $\mathtt{SynGhost}$, an invisible and +universal task-agnostic backdoor attack via syntactic transfer, further +exposing vulnerabilities in pre-trained language models (PLMs). Specifically, +$\mathtt{SynGhost}$ injects multiple syntactic backdoors into the pre-training +space through corpus poisoning, while preserving the PLM's pre-training +capabilities. Second, $\mathtt{SynGhost}$ adaptively selects optimal targets +based on contrastive learning, creating a uniform distribution in the +pre-training space. To identify syntactic differences, we also introduce an +awareness module to minimize interference between backdoors. Experiments show +that $\mathtt{SynGhost}$ poses significant threats and can transfer to various +downstream tasks. Furthermore, $\mathtt{SynGhost}$ resists defenses based on +perplexity, fine-pruning, and $\mathtt{maxEntropy}$. The code is available at +https://github.com/Zhou-CyberSecurity-AI/SynGhost. + +
+
+ comment: 17 pages, 16 figures, 12 tables, accepted at NAACL 2025 Findings +
+
+
+
+
+ + ♻ ☆ Cross-Spectral Vision Transformer for Biometric Authentication using + Forehead Subcutaneous Vein Pattern and Periocular Pattern + + +
+ Traditional biometric systems have encountered significant setbacks due to +various unavoidable factors, for example, face recognition-based biometrics +fails due to the wearing of face masks and fingerprints create hygiene +concerns. This paper proposes a novel lightweight cross-spectral vision +transformer (CS-ViT) for biometric authentication using forehead subcutaneous +vein patterns and periocular patterns, offering a promising alternative to +traditional methods, capable of performing well even with the face masks and +without any physical touch. The proposed framework comprises a cross-spectral +dual-channel architecture designed to handle two distinct biometric traits and +to capture inter-dependencies in terms of relative spectral patterns. Each +channel consists of a Phase-Only Correlation Cross-Spectral Attention (POC-CSA) +that captures their individual as well as correlated patterns. The computation +of cross-spectral attention using POC extracts the phase correlation in the +spatial features. Therefore, it is robust against the resolution/intensity +variations and illumination of the input images, assuming both biometric traits +are from the same person. The lightweight model is suitable for edge device +deployment. The performance of the proposed algorithm was rigorously evaluated +using the Forehead Subcutaneous Vein Pattern and Periocular Biometric Pattern +(FSVP-PBP) database. The results demonstrated the superiority of the algorithm +over state-of-the-art methods, achieving a remarkable classification accuracy +of 98.8% with the combined vein and periocular patterns. + +
+
+ comment: Submitted to IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ Order Matters: Investigate the Position Bias in Multi-constraint + Instruction Following + + +
+ Real-world instructions with multiple constraints pose a significant +challenge to existing large language models (LLMs). An observation is that the +LLMs exhibit dramatic performance fluctuation when disturbing the order of the +incorporated constraints. Yet, none of the existing works has systematically +investigated this position bias problem in the field of multi-constraint +instruction following. To bridge this gap, we design a probing task where we +quantitatively measure the difficulty distribution of the constraints by a +novel Constraint Difficulty Distribution Index (CDDI). Through the experimental results, +we find that LLMs are more performant when presented with the constraints in a +``hard-to-easy'' order. This preference can be generalized to LLMs with +different architecture or different sizes of parameters. Additionally, we +conduct an explanation study, providing an intuitive insight into the +correlation between the LLM's attention and constraint orders. Our code and +dataset are publicly available at https://github.com/meowpass/PBIF. + +
+
+
+
+
+ + ♻ ☆ Representation Engineering: A Top-Down Approach to AI Transparency + + +
+ In this paper, we identify and characterize the emerging area of +representation engineering (RepE), an approach to enhancing the transparency of +AI systems that draws on insights from cognitive neuroscience. RepE places +population-level representations, rather than neurons or circuits, at the +center of analysis, equipping us with novel methods for monitoring and +manipulating high-level cognitive phenomena in deep neural networks (DNNs). We +provide baselines and an initial analysis of RepE techniques, showing that they +offer simple yet effective solutions for improving our understanding and +control of large language models. We showcase how these methods can provide +traction on a wide range of safety-relevant problems, including honesty, +harmlessness, power-seeking, and more, demonstrating the promise of top-down +transparency research. We hope that this work catalyzes further exploration of +RepE and fosters advancements in the transparency and safety of AI systems. + +
+
+ comment: Code is available at + https://github.com/andyzoujm/representation-engineering +
+
+
+
+
+ + ♻ ☆ TokenSelect: Efficient Long-Context Inference and Length Extrapolation + for LLMs via Dynamic Token-Level KV Cache Selection + + +
+ The rapid advancement of Large Language Models (LLMs) has driven growing +demand for processing extended context sequences in contemporary applications. +However, this progress faces two major challenges: performance degradation due +to sequence lengths out-of-distribution, and excessively long inference times +caused by the quadratic computational complexity of attention. These issues +hinder the application of LLMs in long-context scenarios. In this paper, we +propose Dynamic Token-Level KV Cache Selection (TokenSelect), a training-free +method for efficient and accurate long-context inference. TokenSelect builds +upon the observation of non-contiguous attention sparsity, using Query-Key dot +products to measure per-head KV Cache criticality at token-level. By per-head +soft voting mechanism, TokenSelect selectively involves a few critical KV cache +tokens in attention calculation without sacrificing accuracy. To further +accelerate TokenSelect, we design the Selection Cache based on observations of +consecutive Query similarity and implemented efficient dot product kernel, +significantly reducing the overhead. A comprehensive evaluation of TokenSelect +demonstrates up to 23.84x speedup in attention computation and up to 2.28x +acceleration in end-to-end latency, while providing superior performance +compared to state-of-the-art long-context inference methods. + +
+
+
+
+
+ + ♻ ☆ Structural-Entropy-Based Sample Selection for Efficient and Effective + Learning ICLR 2025 + + +
+ Sample selection improves the efficiency and effectiveness of machine +learning models by providing informative and representative samples. Typically, +samples can be modeled as a sample graph, where nodes are samples and edges +represent their similarities. Most existing methods are based on local +information, such as the training difficulty of samples, thereby overlooking +global information, such as connectivity patterns. This oversight can result in +suboptimal selection because global information is crucial for ensuring that +the selected samples well represent the structural properties of the graph. To +address this issue, we employ structural entropy to quantify global information +and losslessly decompose it from the whole graph to individual nodes using the +Shapley value. Based on the decomposition, we present +$\textbf{S}$tructural-$\textbf{E}$ntropy-based sample $\textbf{S}$election +($\textbf{SES}$), a method that integrates both global and local information to +select informative and representative samples. SES begins by constructing a +$k$NN-graph among samples based on their similarities. It then measures sample +importance by combining structural entropy (global metric) with training +difficulty (local metric). Finally, SES applies importance-biased blue noise +sampling to select a set of diverse and representative samples. Comprehensive +experiments on three learning scenarios -- supervised learning, active +learning, and continual learning -- clearly demonstrate the effectiveness of +our method. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ PATCH: a deep learning method to assess heterogeneity of artistic + practice in historical paintings + + +
+ The history of art has seen significant shifts in the manner in which +artworks are created, making understanding of creative processes a central +question in technical art history. In the Renaissance and Early Modern period, +paintings were largely produced by master painters directing workshops of +apprentices who often contributed to projects. The masters varied significantly +in artistic and managerial styles, meaning different combinations of artists +and implements might be seen both between masters and within workshops or even +individual canvases. Information on how different workshops were managed and +the processes by which artworks were created remains elusive. Machine learning +methods have potential to unearth new information about artists' creative +processes by extending the analysis of brushwork to a microscopic scale. +Analysis of workshop paintings, however, presents a challenge in that +documentation of the artists and materials involved is sparse, meaning external +examples are not available to train networks to recognize their contributions. +Here we present a novel machine learning approach we call pairwise assignment +training for classifying heterogeneity (PATCH) that is capable of identifying +individual artistic practice regimes with no external training data, or "ground +truth." The method achieves unsupervised results by supervised means, and +outperforms both simple statistical procedures and unsupervised machine +learning methods. We apply this method to two historical paintings by the +Spanish Renaissance master, El Greco: The Baptism of Christ and Christ on the +Cross with Landscape, and our findings regarding the former potentially +challenge previous work that has assigned the painting to workshop members. +Further, the results of our analyses create a measure of heterogeneity of +artistic practice that can be used to characterize artworks across time and +space. + +
+
+ comment: main text: 16 pages, 6 figures; SI: 7 pages, 3 figures; v2: minor + typo corrections, higher resolution figures +
+
+
+
+
+ + ♻ ☆ Spontaneous Giving and Calculated Greed in Language Models + + +
+ Large language models demonstrate advanced problem-solving capabilities by +incorporating reasoning techniques such as chain of thought and reflection. +However, how these reasoning capabilities extend to social intelligence remains +unclear. In this study, we investigate this question using economic games that +model social dilemmas, where social intelligence plays a crucial role. First, +we examine the effects of chain-of-thought and reflection techniques in a +public goods game. We then extend our analysis to six economic games on +cooperation and punishment, comparing off-the-shelf non-reasoning and reasoning +models. We find that reasoning models significantly reduce cooperation and norm +enforcement, prioritizing individual rationality. Consequently, groups with +more reasoning models exhibit less cooperation and lower gains through repeated +interactions. These behaviors parallel human tendencies of "spontaneous giving +and calculated greed." Our results suggest the need for AI architectures that +incorporate social intelligence alongside reasoning capabilities to ensure that +AI supports, rather than disrupts, human cooperative intuition. + +
+
+
+
+
+ + ♻ ☆ Generative Representational Instruction Tuning + + +
+ All text-based language problems can be reduced to either generation or +embedding. Current models only perform well at one or the other. We introduce +generative representational instruction tuning (GRIT) whereby a large language +model is trained to handle both generative and embedding tasks by +distinguishing between them through instructions. Compared to other open +models, our resulting GritLM 7B sets a new state of the art on the Massive Text +Embedding Benchmark (MTEB) and outperforms all models up to its size on a range +of generative tasks. By scaling up further, GritLM 8x7B outperforms all open +generative language models that we tried while still being among the best +embedding models. Notably, we find that GRIT matches training on only +generative or embedding data, thus we can unify both at no performance loss. +Among other benefits, the unification via GRIT speeds up Retrieval-Augmented +Generation (RAG) by > 60% for long documents, by no longer requiring separate +retrieval and generation models. Models, code, etc. are freely available at +https://github.com/ContextualAI/gritlm. + +
+
+ comment: 67 pages (16 main), 25 figures, 34 tables +
+
+
+
+
+ + ♻ ☆ Federated Learning in Practice: Reflections and Projections + + +
+ Federated Learning (FL) is a machine learning technique that enables multiple +entities to collaboratively learn a shared model without exchanging their local +data. Over the past decade, FL systems have achieved substantial progress, +scaling to millions of devices across various learning domains while offering +meaningful differential privacy (DP) guarantees. Production systems from +organizations like Google, Apple, and Meta demonstrate the real-world +applicability of FL. However, key challenges remain, including verifying +server-side DP guarantees and coordinating training across heterogeneous +devices, limiting broader adoption. Additionally, emerging trends such as large +(multi-modal) models and blurred lines between training, inference, and +personalization challenge traditional FL frameworks. In response, we propose a +redefined FL framework that prioritizes privacy principles rather than rigid +definitions. We also chart a path forward by leveraging trusted execution +environments and open-source ecosystems to address these challenges and +facilitate future advancements in FL. + +
+
+ comment: Published at 2024 IEEE 6th International Conference on Trust, Privacy + and Security in Intelligent Systems, and Applications (TPS-ISA) +
+
+
+
+
+ + ♻ ☆ MV-MATH: Evaluating Multimodal Math Reasoning in Multi-Visual Contexts + + +
+ Multimodal Large Language Models (MLLMs) have shown promising capabilities in +mathematical reasoning within visual contexts across various datasets. However, +most existing multimodal math benchmarks are limited to single-visual contexts, +which diverges from the multi-visual scenarios commonly encountered in +real-world mathematical applications. To address this gap, we introduce +MV-MATH: a meticulously curated dataset of 2,009 high-quality mathematical +problems. Each problem integrates multiple images interleaved with text, +derived from authentic K-12 scenarios, and enriched with detailed annotations. +MV-MATH includes multiple-choice, free-form, and multi-step questions, covering +11 subject areas across 3 difficulty levels, and serves as a comprehensive and +rigorous benchmark for assessing MLLMs' mathematical reasoning in multi-visual +contexts. Through extensive experimentation, we observe that MLLMs encounter +substantial challenges in multi-visual math tasks, with a considerable +performance gap relative to human capabilities on MV-MATH. Furthermore, we +analyze the performance and error patterns of various models, providing +insights into MLLMs' mathematical reasoning capabilities within multi-visual +settings. + +
+
+ comment: 47 pages +
+
+
+
+
+ + ♻ ☆ Iterative Nash Policy Optimization: Aligning LLMs with General + Preferences via No-Regret Learning + + +
+ Reinforcement Learning with Human Feedback (RLHF) has achieved great success +in aligning large language models (LLMs) with human preferences. Prevalent RLHF +approaches are reward-based, following the Bradley-Terry (BT) model assumption, +which may not fully capture the complexity of human preferences. In this paper, +we explore RLHF under a general preference framework and approach it from a +game-theoretic perspective. Specifically, we formulate the problem as a +two-player game and propose a novel online algorithm, iterative Nash policy +optimization (INPO). The key idea is to let the policy play against itself via +no-regret learning, thereby approximating the Nash policy. Unlike previous +methods, INPO bypasses the need for estimating the expected win rate for +individual responses, which typically incurs high computational or annotation +costs. Instead, we introduce a new loss objective that is directly minimized +over a preference dataset. We provide theoretical analysis for our approach and +demonstrate its effectiveness through experiments on various representative +benchmarks. With an LLaMA-3-8B-based SFT model, INPO achieves a 42.6% +length-controlled win rate on AlpacaEval 2.0 and a 37.8% win rate on +Arena-Hard, showing substantial improvement over the state-of-the-art online +RLHF algorithms. + +
+
+
+
+
+ + ♻ ☆ Discovering physical laws with parallel combinatorial tree search + + +
+ Symbolic regression plays a crucial role in modern scientific research thanks +to its capability of discovering concise and interpretable mathematical +expressions from data. A grand challenge lies in the arduous search for +parsimonious and generalizable mathematical formulas, in an infinite search +space, while intending to fit the training data. Existing algorithms have faced +a critical bottleneck of accuracy and efficiency over a decade when handling +problems of complexity, which essentially hinders the pace of applying symbolic +regression for scientific exploration across interdisciplinary domains. To this +end, we introduce a parallel combinatorial tree search (PCTS) model to +efficiently distill generic mathematical expressions from limited data. Through +a series of extensive experiments, we demonstrate the superior accuracy and +efficiency of PCTS for equation discovery, which greatly outperforms the +state-of-the-art baseline models on over 200 synthetic and experimental +datasets (e.g., lifting its performance by up to 99% accuracy improvement and +one-order of magnitude speed up). PCTS represents a key advance in accurate and +efficient data-driven discovery of symbolic, interpretable models (e.g., +underlying physical laws) and marks a pivotal transition towards scalable +symbolic learning. + +
+
+
+
+
+ + ♻ ☆ Optimization-based Prompt Injection Attack to LLM-as-a-Judge CCS + + +
+ LLM-as-a-Judge uses a large language model (LLM) to select the best response +from a set of candidates for a given question. LLM-as-a-Judge has many +applications such as LLM-powered search, reinforcement learning with AI +feedback (RLAIF), and tool selection. In this work, we propose JudgeDeceiver, +an optimization-based prompt injection attack to LLM-as-a-Judge. JudgeDeceiver +injects a carefully crafted sequence into an attacker-controlled candidate +response such that LLM-as-a-Judge selects the candidate response for an +attacker-chosen question no matter what other candidate responses are. +Specifically, we formulate finding such a sequence as an optimization problem and +propose a gradient-based method to approximately solve it. Our extensive +evaluation shows that JudgeDeceiver is highly effective, and is much more +effective than existing prompt injection attacks that manually craft the +injected sequences and jailbreak attacks when extended to our problem. We also +show the effectiveness of JudgeDeceiver in three case studies, i.e., +LLM-powered search, RLAIF, and tool selection. Moreover, we consider defenses +including known-answer detection, perplexity detection, and perplexity windowed +detection. Our results show these defenses are insufficient, highlighting the +urgent need for developing new defense strategies. Our implementation is +available at this repository: https://github.com/ShiJiawenwen/JudgeDeceiver. + +
+
+ comment: To appear in the Proceedings of The ACM Conference on Computer and + Communications Security (CCS), 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Learn Weight Generation via Trajectory Diffusion + + +
+ Diffusion-based algorithms have emerged as promising techniques for weight +generation, particularly in scenarios like multi-task learning that require +frequent weight updates. However, existing solutions suffer from limited +cross-task transferability. In addition, they only utilize optimal weights as +training samples, ignoring the value of other weights in the optimization +process. To address these issues, we propose Lt-Di, which integrates the +diffusion algorithm with meta-learning to generate weights for unseen tasks. +Furthermore, we extend the vanilla diffusion algorithm into a trajectory +diffusion algorithm to utilize other weights along the optimization trajectory. +Trajectory diffusion decomposes the entire diffusion chain into multiple +shorter ones, improving training and inference efficiency. We analyze the +convergence properties of the weight generation paradigm and improve +convergence efficiency without additional time overhead. Our experiments +demonstrate Lt-Di's higher accuracy while reducing computational overhead +across various tasks, including zero-shot and few-shot learning, multi-domain +generalization, and large-scale language model fine-tuning. Our code is released +at https://anonymous.4open.science/r/Lt-Di-0E51. + +
+
+
+
+
+ + ♻ ☆ Multi-modal AI for comprehensive breast cancer prognostication + + +
+ Treatment selection in breast cancer is guided by molecular subtypes and +clinical characteristics. However, current tools including genomic assays lack +the accuracy required for optimal clinical decision-making. We developed a +novel artificial intelligence (AI)-based approach that integrates digital +pathology images with clinical data, providing a more robust and effective +method for predicting the risk of cancer recurrence in breast cancer patients. +Specifically, we utilized a vision transformer pan-cancer foundation model +trained with self-supervised learning to extract features from digitized +H&E-stained slides. These features were integrated with clinical data to form a +multi-modal AI test predicting cancer recurrence and death. The test was +developed and evaluated using data from a total of 8,161 female breast cancer +patients across 15 cohorts originating from seven countries. Of these, 3,502 +patients from five cohorts were used exclusively for evaluation, while the +remaining patients were used for training. Our test accurately predicted our +primary endpoint, disease-free interval, in the five evaluation cohorts +(C-index: 0.71 [0.68-0.75], HR: 3.63 [3.02-4.37, p<0.001]). In a direct +comparison (n=858), the AI test was more accurate than Oncotype DX, the +standard-of-care 21-gene assay, achieving a C-index of 0.67 [0.61-0.74] versus +0.61 [0.49-0.73], respectively. Additionally, the AI test added independent +prognostic information to Oncotype DX in a multivariate analysis (HR: 3.11 +[1.91-5.09, p<0.001]). The test demonstrated robust accuracy across major +molecular breast cancer subtypes, including TNBC (C-index: 0.71 [0.62-0.81], +HR: 3.81 [2.35-6.17, p=0.02]), where no diagnostic tools are currently +recommended by clinical guidelines. These results suggest that our AI test +improves upon the accuracy of existing prognostic tests, while being applicable +to a wider range of patients. + +
+
+
+
+
+ + ♻ ☆ LLMOPT: Learning to Define and Solve General Optimization Problems from + Scratch + + +
+ Optimization problems are prevalent across various scenarios. Formulating and +then solving optimization problems described by natural language often requires +highly specialized human expertise, which could block the widespread +application of optimization-based decision making. To automate problem +formulation and solving, leveraging large language models (LLMs) has emerged as +a potential way. However, this kind of approach suffers from the issue of +optimization generalization. Namely, the accuracy of most current LLM-based +methods and the generality of optimization problem types that they can model +are still limited. In this paper, we propose a unified learning-based framework +called LLMOPT to boost optimization generalization. Starting from the natural +language descriptions of optimization problems and a pre-trained LLM, LLMOPT +constructs the introduced five-element formulation as a universal model for +learning to define diverse optimization problem types. Then, LLMOPT employs the +multi-instruction tuning to enhance both problem formalization and solver code +generation accuracy and generality. After that, to prevent hallucinations in +LLMs, such as sacrificing solving accuracy to avoid execution errors, the model +alignment and self-correction mechanism are adopted in LLMOPT. We evaluate the +optimization generalization ability of LLMOPT and compared methods across six +real-world datasets covering roughly 20 fields such as health, environment, +energy and manufacturing, etc. Extensive experiment results show that LLMOPT is +able to model various optimization problem types such as linear/nonlinear +programming, mixed integer programming, and combinatorial optimization, and +achieves a notable 11.08% average solving accuracy improvement compared with +the state-of-the-art methods. The code is available at +https://github.com/caigaojiang/LLMOPT. + +
+
+
+
+
+ + ♻ ☆ GAMED-Snake: Gradient-aware Adaptive Momentum Evolution Deep Snake Model + for Multi-organ Segmentation + + +
+ Multi-organ segmentation is a critical yet challenging task due to complex +anatomical backgrounds, blurred boundaries, and diverse morphologies. This +study introduces the Gradient-aware Adaptive Momentum Evolution Deep Snake +(GAMED-Snake) model, which establishes a novel paradigm for contour-based +segmentation by integrating gradient-based learning with adaptive momentum +evolution mechanisms. The GAMED-Snake model incorporates three major +innovations: First, the Distance Energy Map Prior (DEMP) generates a +pixel-level force field that effectively attracts contour points towards the +true boundaries, even in scenarios with complex backgrounds and blurred edges. +Second, the Differential Convolution Inception Module (DCIM) precisely extracts +comprehensive energy gradients, significantly enhancing segmentation accuracy. +Third, the Adaptive Momentum Evolution Mechanism (AMEM) employs cross-attention +to establish dynamic features across different iterations of evolution, +enabling precise boundary alignment for diverse morphologies. Experimental +results on four challenging multi-organ segmentation datasets demonstrate that +GAMED-Snake improves the mDice metric by approximately 2% compared to +state-of-the-art methods. Code will be available at +https://github.com/SYSUzrc/GAMED-Snake. + +
+
+
+
+
+ + ♻ ☆ On the Generalization and Adaptation Ability of Machine-Generated Text + Detectors in Academic Writing + + +
+ The rising popularity of large language models (LLMs) has raised concerns +about machine-generated text (MGT), particularly in academic settings, where +issues like plagiarism and misinformation are prevalent. As a result, +developing a highly generalizable and adaptable MGT detection system has become +an urgent priority. Given that LLMs are most commonly misused in academic +writing, this work investigates the generalization and adaptation capabilities +of MGT detectors in three key aspects specific to academic writing: First, we +construct MGT-Acedemic, a large-scale dataset comprising over 336M tokens and +749K samples. MGT-Acedemic focuses on academic writing, featuring human-written +texts (HWTs) and MGTs across STEM, Humanities, and Social Sciences, paired with +an extensible code framework for efficient benchmarking. Second, we benchmark +the performance of various detectors for binary classification and attribution +tasks in both in-domain and cross-domain settings. This benchmark reveals the +often-overlooked challenges of attribution tasks. Third, we introduce a novel +attribution task where models have to adapt to new classes over time without +(or with very limited) access to prior training data in both few-shot and +many-shot scenarios. We implement eight different adapting techniques to +improve the performance and highlight the inherent complexity of the task. Our +findings provide insights into the generalization and adaptation ability of MGT +detectors across diverse scenarios and lay the foundation for building robust, +adaptive detection systems. The code framework is available at +https://github.com/Y-L-LIU/MGTBench-2.0. + +
+
+
+
+
+ + ♻ ☆ HORAE: A Domain-Agnostic Modeling Language for Automating Multimodal + Service Regulation + + +
+ Artificial intelligence is rapidly encroaching on the field of service +regulation. This work-in-progress article presents the design principles behind +HORAE, a unified specification language to model multimodal regulation rules +across a diverse set of domains. We show how HORAE facilitates an intelligent +service regulation pipeline by further exploiting a fine-tuned large language +model named HORAE that automates the HORAE modeling process, thereby yielding +an end-to-end framework for fully automated intelligent service regulation. + +
+
+
+
+
+ + ♻ ☆ Controllable Context Sensitivity and the Knob Behind It ICLR 2025 + + +
+ When making predictions, a language model must trade off how much it relies +on its context vs. its prior knowledge. Choosing how sensitive the model is to +its context is a fundamental functionality, as it enables the model to excel at +tasks like retrieval-augmented generation and question-answering. In this +paper, we search for a knob which controls this sensitivity, determining +whether language models answer from the context or their prior knowledge. To +guide this search, we design a task for controllable context sensitivity. In +this task, we first feed the model a context (Paris is in England) and a +question (Where is Paris?); we then instruct the model to either use its prior +or contextual knowledge and evaluate whether it generates the correct answer +for both intents (either France or England). When fine-tuned on this task, +instruction-tuned versions of Llama-3.1, Mistral-v0.3, and Gemma-2 can solve it +with high accuracy (85-95%). Analyzing these high-performing models, we narrow +down which layers may be important to context sensitivity using a novel linear +time algorithm. Then, in each model, we identify a 1-D subspace in a single +layer that encodes whether the model follows context or prior knowledge. +Interestingly, while we identify this subspace in a fine-tuned model, we find +that the exact same subspace serves as an effective knob in not only that model +but also non-fine-tuned instruct and base models of that model family. Finally, +we show a strong correlation between a model's performance and how distinctly +it separates context-agreeing from context-ignoring answers in this subspace. +These results suggest a single subspace facilitates how the model chooses +between context and prior knowledge, hinting at a simple fundamental mechanism +that controls this behavior. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ A Pilot Empirical Study on When and How to Use Knowledge Graphs as + Retrieval Augmented Generation + + +
+ The integration of Knowledge Graphs (KGs) into the Retrieval Augmented +Generation (RAG) framework has attracted significant interest, with early +studies showing promise in mitigating hallucinations and improving model +accuracy. However, a systematic understanding and comparative analysis of the +rapidly emerging KG-RAG methods are still lacking. This paper seeks to lay the +foundation for systematically answering the question of when and how to use +KG-RAG by analyzing their performance in various application scenarios +associated with different technical configurations. After outlining the mind +map using KG-RAG framework and summarizing its popular pipeline, we conduct a +pilot empirical study of KG-RAG works to reimplement and evaluate 6 KG-RAG +methods across 7 datasets in diverse scenarios, analyzing the impact of 9 +KG-RAG configurations in combination with 17 LLMs. Our results underscore the +critical role of appropriate application conditions and optimal configurations +of KG-RAG components. + +
+
+ comment: 8 pages, 2 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Scaling Offline Model-Based RL via Jointly-Optimized World-Action Model + Pretraining ICLR 2025 + + +
+ A significant aspiration of offline reinforcement learning (RL) is to develop +a generalist agent with high capabilities from large and heterogeneous +datasets. However, prior approaches that scale offline RL either rely heavily +on expert trajectories or struggle to generalize to diverse unseen tasks. +Inspired by the excellent generalization of world model in conditional video +generation, we explore the potential of image observation-based world model for +scaling offline RL and enhancing generalization on novel tasks. In this paper, +we introduce JOWA: Jointly-Optimized World-Action model, an offline model-based +RL agent pretrained on multiple Atari games with 6 billion tokens data to learn +general-purpose representation and decision-making ability. Our method jointly +optimizes a world-action model through a shared transformer backbone, which +stabilizes temporal difference learning with large models during pretraining. +Moreover, we propose a provably efficient and parallelizable planning algorithm +to compensate for the Q-value estimation error and thus search out better +policies. Experimental results indicate that our largest agent, with 150 +million parameters, achieves 78.9% human-level performance on pretrained games +using only 10% subsampled offline data, outperforming existing state-of-the-art +large-scale offline RL baselines by 31.6% on average. Furthermore, JOWA scales +favorably with model capacity and can sample-efficiently transfer to novel +games using only 5k offline fine-tuning data (approximately 4 trajectories) per +game, demonstrating superior generalization. We will release codes and model +weights at https://github.com/CJReinforce/JOWA + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ PhyMPGN: Physics-encoded Message Passing Graph Network for + spatiotemporal PDE systems + + +
+ Solving partial differential equations (PDEs) serves as a cornerstone for +modeling complex dynamical systems. Recent progresses have demonstrated grand +benefits of data-driven neural-based models for predicting spatiotemporal +dynamics (e.g., tremendous speedup gain compared with classical numerical +methods). However, most existing neural models rely on rich training data, have +limited extrapolation and generalization abilities, and suffer to produce +precise or reliable physical prediction under intricate conditions (e.g., +irregular mesh or geometry, complex boundary conditions, diverse PDE +parameters, etc.). To this end, we propose a new graph learning approach, +namely, Physics-encoded Message Passing Graph Network (PhyMPGN), to model +spatiotemporal PDE systems on irregular meshes given small training datasets. +Specifically, we incorporate a GNN into a numerical integrator to approximate +the temporal marching of spatiotemporal dynamics for a given PDE system. +Considering that many physical phenomena are governed by diffusion processes, +we further design a learnable Laplace block, which encodes the discrete +Laplace-Beltrami operator, to aid and guide the GNN learning in a physically +feasible solution space. A boundary condition padding strategy is also designed +to improve the model convergence and accuracy. Extensive experiments +demonstrate that PhyMPGN is capable of accurately predicting various types of +spatiotemporal dynamics on coarse unstructured meshes, consistently achieves +the state-of-the-art results, and outperforms other baselines with considerable +gains. + +
+
+
+
+
+ + ♻ ☆ A Closer Look at Machine Unlearning for Large Language Models ICLR 2025 + + +
+ Large language models (LLMs) may memorize sensitive or copyrighted content, +raising privacy and legal concerns. Due to the high cost of retraining from +scratch, researchers attempt to employ machine unlearning to remove specific +content from LLMs while preserving the overall performance. In this paper, we +discuss several issues in machine unlearning for LLMs and provide our insights +on possible approaches. To address the issue of inadequate evaluation of model +outputs after unlearning, we introduce three additional metrics to evaluate +token diversity, sentence semantics, and factual correctness. We then +categorize unlearning methods into untargeted and targeted, and discuss their +issues respectively. Specifically, the behavior that untargeted unlearning +attempts to approximate is unpredictable and may involve hallucinations, and +existing regularization is insufficient for targeted unlearning. To alleviate +these issues, we propose using the objective of maximizing entropy (ME) for +untargeted unlearning and incorporate answer preservation (AP) loss as +regularization for targeted unlearning. Experimental results across three +scenarios, i.e., fictitious unlearning, continual unlearning, and real-world +unlearning, demonstrate the effectiveness of our approaches. The code is +available at https://github.com/sail-sg/closer-look-LLM-unlearning. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Scalable Decision-Making in Stochastic Environments through Learned + Temporal Abstraction ICLR2025 + + +
+ Sequential decision-making in high-dimensional continuous action spaces, +particularly in stochastic environments, faces significant computational +challenges. We explore this challenge in the traditional offline RL setting, +where an agent must learn how to make decisions based on data collected through +a stochastic behavior policy. We present Latent Macro Action Planner (L-MAP), +which addresses this challenge by learning a set of temporally extended +macro-actions through a state-conditional Vector Quantized Variational +Autoencoder (VQ-VAE), effectively reducing action dimensionality. L-MAP employs +a (separate) learned prior model that acts as a latent transition model and +allows efficient sampling of plausible actions. During planning, our approach +accounts for stochasticity in both the environment and the behavior policy by +using Monte Carlo tree search (MCTS). In offline RL settings, including +stochastic continuous control tasks, L-MAP efficiently searches over discrete +latent actions to yield high expected returns. Empirical results demonstrate +that L-MAP maintains low decision latency despite increased action +dimensionality. Notably, across tasks ranging from continuous control with +inherently stochastic dynamics to high-dimensional robotic hand manipulation, +L-MAP significantly outperforms existing model-based methods and performs +on-par with strong model-free actor-critic baselines, highlighting the +effectiveness of the proposed approach in planning in complex and stochastic +environments with high-dimensional action spaces. + +
+
+ comment: Accepted by ICLR2025. Code would be available at + https://github.com/BaitingLuo/L-MAP.git +
+
+
+
+
+ + ♻ ☆ AdEval: Alignment-based Dynamic Evaluation to Mitigate Data + Contamination in Large Language Models + + +
+ As Large Language Models (LLMs) are pretrained on massive-scale corpora, the +issue of data contamination has become increasingly severe, leading to +potential overestimation of model performance during evaluation. To address +this, we propose AdEval (Alignment-based Dynamic Evaluation), a dynamic data +evaluation method aimed at mitigating the impact of data contamination on +evaluation reliability. Experimental results on multiple datasets demonstrate +that AdEval effectively reduces the impact of data contamination on evaluation +outcomes, enhancing both the fairness and reliability of the evaluation +process. + +
+
+ comment: There are serious academic problems in this paper, such as data + falsification and plagiarism in the method of the paper +
+
+
+
+
+ + ♻ ☆ Towards Generalizable Scene Change Detection CVPR 2025 + + +
+ While current state-of-the-art Scene Change Detection (SCD) approaches +achieve impressive results in well-trained research data, they become +unreliable under unseen environments and different temporal conditions; +in-domain performance drops from 77.6% to 8.0% in a previously unseen +environment and to 4.6% under a different temporal condition -- calling for +generalizable SCD and benchmark. In this work, we propose the Generalizable +Scene Change Detection Framework (GeSCF), which addresses unseen domain +performance and temporal consistency -- to meet the growing demand for anything +SCD. Our method leverages the pre-trained Segment Anything Model (SAM) in a +zero-shot manner. For this, we design Initial Pseudo-mask Generation and +Geometric-Semantic Mask Matching -- seamlessly turning user-guided prompt and +single-image based segmentation into scene change detection for a pair of +inputs without guidance. Furthermore, we define the Generalizable Scene Change +Detection (GeSCD) benchmark along with novel metrics and an evaluation protocol +to facilitate SCD research in generalizability. In the process, we introduce +the ChangeVPR dataset, a collection of challenging image pairs with diverse +environmental scenarios -- including urban, suburban, and rural settings. +Extensive experiments across various datasets demonstrate that GeSCF achieves +an average performance gain of 19.2% on existing SCD datasets and 30.0% on +the ChangeVPR dataset, nearly doubling the prior art performance. We believe +our work can lay a solid foundation for robust and generalizable SCD research. + +
+
+ comment: Manuscript. Accepted to CVPR 2025 +
+
+
+
+
+ + ♻ ☆ OLMoE: Open Mixture-of-Experts Language Models + + +
+ We introduce OLMoE, a fully open, state-of-the-art language model leveraging +sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but +uses only 1B per input token. We pretrain it on 5 trillion tokens and further +adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available +models with similar active parameters, even surpassing larger ones like +Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE +training, analyze routing in our model showing high specialization, and +open-source all aspects of our work: model weights, training data, code, and +logs. + +
+
+ comment: 63 pages (24 main), 36 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ BECAUSE: Bilinear Causal Representation for Generalizable Offline + Model-based Reinforcement Learning + + +
+ Offline model-based reinforcement learning (MBRL) enhances data efficiency by +utilizing pre-collected datasets to learn models and policies, especially in +scenarios where exploration is costly or infeasible. Nevertheless, its +performance often suffers from the objective mismatch between model and policy +learning, resulting in inferior performance despite accurate model predictions. +This paper first identifies that the primary source of this mismatch comes from the +underlying confounders present in offline data for MBRL. Subsequently, we +introduce BilinEar CAUSal +rEpresentation (BECAUSE), an algorithm to capture causal +representation for both states and actions to reduce the influence of the +distribution shift, thus mitigating the objective mismatch problem. +Comprehensive evaluations on 18 tasks that vary in data quality and environment +context demonstrate the superior performance of BECAUSE over existing offline +RL algorithms. We show the generalizability and robustness of BECAUSE under +fewer samples or larger numbers of confounders. Additionally, we offer +theoretical analysis of BECAUSE to prove its error bound and sample efficiency +when integrating causal representation into offline MBRL. + +
+
+
+
+
+ + ♻ ☆ The Labyrinth of Links: Navigating the Associative Maze of Multi-modal + LLMs ICLR 2025 + + +
+ Multi-modal Large Language Models (MLLMs) have exhibited impressive +capability. However, recently many deficiencies of MLLMs have been found +compared to human intelligence, e.g., hallucination. To drive the +MLLMs study, the community dedicated efforts to building larger benchmarks with +complex tasks. In this paper, we propose benchmarking an essential but usually +overlooked intelligence: association, a human's basic capability to +link observation and prior practice memory. To comprehensively investigate +MLLM's performance on the association, we formulate the association task and +devise a standard benchmark based on adjective and verb semantic concepts. +Instead of costly data annotation and curation, we propose a convenient +annotation-free construction method transforming the general dataset +for our association tasks. Simultaneously, we devise a rigorous data refinement +process to eliminate confusion in the raw dataset. Building on this database, +we establish three levels of association tasks: single-step, synchronous, and +asynchronous associations. Moreover, we conduct a comprehensive investigation +into the MLLMs' zero-shot association capabilities, addressing multiple +dimensions, including three distinct memory strategies, both open-source and +closed-source MLLMs, cutting-edge Mixture-of-Experts (MoE) models, and the +involvement of human experts. Our systematic investigation shows that current +open-source MLLMs consistently exhibit poor capability in our association +tasks, even the currently state-of-the-art GPT-4V(vision) also has a +significant gap compared to humans. We believe our benchmark would pave the way +for future MLLM studies. Our data and code are available at: +https://mvig-rhos.com/llm_inception. + +
+
+ comment: Accepted by ICLR 2025. Project page: + https://mvig-rhos.com/llm_inception +
+
+
+
+
+ + ♻ ☆ NL2FOL: Translating Natural Language to First-Order Logic for Logical + Fallacy Detection + + +
+ Translating natural language into formal language such as First-Order Logic +(FOL) is a foundational challenge in NLP with wide-ranging applications in +automated reasoning, misinformation tracking, and knowledge validation. In this +paper, we introduce Natural Language to First-Order Logic (NL2FOL), a framework +to autoformalize natural language to FOL step by step using Large Language +Models (LLMs). Our approach addresses key challenges in this translation +process, including the integration of implicit background knowledge. By +leveraging structured representations generated by NL2FOL, we use +Satisfiability Modulo Theory (SMT) solvers to reason about the logical validity +of natural language statements. We present logical fallacy detection as a case +study to evaluate the efficacy of NL2FOL. Being neurosymbolic, our approach +also provides interpretable insights into the reasoning process and +demonstrates robustness without requiring model fine-tuning or labeled training +data. Our framework achieves strong performance on multiple datasets. On the +LOGIC dataset, NL2FOL achieves an F1-score of 78%, while generalizing +effectively to the LOGICCLIMATE dataset with an F1-score of 80%. + +
+
+
+
+
+ + ♻ ☆ Performance Review on LLM for solving leetcode problems + + +
+ This paper presents a comprehensive performance evaluation of Large Language +Models (LLMs) in solving programming challenges from Leetcode, a widely used +platform for algorithm practice and technical interviews. We began by crawling +the Leetcode website to collect a diverse set of problems encompassing various +difficulty levels and topics. Using this dataset, we generated solutions with +multiple LLMs, including GPT-4 and GPT-3.5-turbo (ChatGPT-turbo). The generated +solutions were systematically evaluated for correctness and efficiency. We +employed the pass@k metric to assess the success rates within a given number of +attempts and analyzed the runtime performance of the solutions. Our results +highlight the strengths and limitations of current LLMs [10] in code generation +and problem-solving tasks, providing insights into their potential applications +and areas for improvement in automated programming assistance. + +
+
+
+
+
+
+
+
+ + Genomics 6 + +
+
+
+ + ☆ Fungal Genetic Variants in Oceanic Environments + + +
+ Comparing specific types of organisms as they are found across environmental +conditions has helped inform how genes and gene products of these organisms +relate to phenotypes and adaptation. In this study, we examine +metatranscriptomic data as found for oceanic fungi across different oceanic +sampling sites. A specific set of three genes was chosen for evaluation based +on conserved orthology, known association with core physiological processes in +fungi, and level of abundance within oceanic metatranscriptomic data. We report +upon a potential association of genetic variance with environmental conditions +of iron, salt and phosphate in oceanic waters based on heatmap visualization +and PERMANOVA analysis. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Primer C-VAE: An interpretable deep learning primer design method to + detect emerging virus variants + + +
+ Motivation: PCR is more economical and quicker than Next Generation +Sequencing for detecting target organisms, with primer design being a critical +step. In epidemiology with rapidly mutating viruses, designing effective +primers is challenging. Traditional methods require substantial manual +intervention and struggle to ensure effective primer design across different +strains. For organisms with large, similar genomes like Escherichia coli and +Shigella flexneri, differentiating between species is also difficult but +crucial. + Results: We developed Primer C-VAE, a model based on a Variational +Auto-Encoder framework with Convolutional Neural Networks to identify variants +and generate specific primers. Using SARS-CoV-2, our model classified variants +(alpha, beta, gamma, delta, omicron) with 98% accuracy and generated +variant-specific primers. These primers appeared with >95% frequency in target +variants and <5% in others, showing good performance in in-silico PCR tests. +For Alpha, Delta, and Omicron, our primer pairs produced fragments <200 bp, +suitable for qPCR detection. The model also generated effective primers for +organisms with longer gene sequences like E. coli and S. flexneri. + Conclusion: Primer C-VAE is an interpretable deep learning approach for +developing specific primer pairs for target organisms. This flexible, +semi-automated and reliable tool works regardless of sequence completeness and +length, allowing for qPCR applications and can be applied to organisms with +large and highly similar genomes. + +
+
+
+
+
+ + ♻ ☆ MLOmics: Benchmark for Machine Learning on Cancer Multi-Omics Data + + +
+ Framing the investigation of diverse cancers as a machine learning problem +has recently shown significant potential in multi-omics analysis and cancer +research. Empowering these successful machine learning models are the +high-quality training datasets with sufficient data volume and adequate +preprocessing. However, while there exist several public data portals including +The Cancer Genome Atlas (TCGA) multi-omics initiative or open-bases such as the +LinkedOmics, these databases are not off-the-shelf for existing machine +learning models. In this paper we propose MLOmics, an open cancer multi-omics +benchmark aiming at serving better the development and evaluation of +bioinformatics and machine learning models. MLOmics contains 8,314 patient +samples covering all 32 cancer types with four omics types, stratified +features, and extensive baselines. Complementary support for downstream +analysis and bio-knowledge linking are also included to support +interdisciplinary analysis. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Penalized Principal Component Analysis Using Smoothing + + +
+ Principal components computed via PCA (principal component analysis) are +traditionally used to reduce dimensionality in genomic data or to correct for +population stratification. In this paper, we explore the penalized eigenvalue +problem (PEP) which reformulates the computation of the first eigenvector as an +optimization problem and adds an $L_1$ penalty constraint to enforce sparseness +of the solution. The contribution of our article is threefold. First, we extend +PEP by applying smoothing to the original LASSO-type $L_1$ penalty. This allows +one to compute analytical gradients which enable faster and more efficient +minimization of the objective function associated with the optimization +problem. Second, we demonstrate how higher order eigenvectors can be calculated +with PEP using established results from singular value decomposition (SVD). +Third, we present four experimental studies to demonstrate the usefulness of +the smoothed penalized eigenvectors. Using data from the 1000 Genomes Project +dataset, we empirically demonstrate that our proposed smoothed PEP allows one +to increase numerical stability and obtain meaningful eigenvectors. We also +employ the penalized eigenvector approach in two additional real data +applications (computation of a polygenic risk score and clustering), +demonstrating that exchanging the penalized eigenvectors for their smoothed +counterparts can increase prediction accuracy in polygenic risk scores and +enhance discernibility of clusterings. Moreover, we compare our proposed +smoothed PEP to seven state-of-the-art algorithms for sparse PCA and evaluate +the accuracy of the obtained eigenvectors, their support recovery, and their +runtime. + +
+
+
+
+
+ + ♻ ☆ Whole Genome Transformer for Gene Interaction Effects in Microbiome + Habitat Specificity AAAI 2025 + + +
+ Leveraging the vast genetic diversity within microbiomes offers unparalleled +insights into complex phenotypes, yet the task of accurately predicting and +understanding such traits from genomic data remains challenging. We propose a +framework taking advantage of existing large models for gene vectorization to +predict habitat specificity from entire microbial genome sequences. Based on +our model, we develop attribution techniques to elucidate gene interaction +effects that drive microbial adaptation to diverse environments. We train and +validate our approach on a large dataset of high quality microbiome genomes +from different habitats. We not only demonstrate solid predictive performance, +but also how sequence-level information of entire genomes allows us to identify +gene associations underlying complex phenotypes. Our attribution recovers known +important interaction networks and proposes new candidates for experimental +follow up. + +
+
+ comment: published at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Multi-Modal and Multi-Attribute Generation of Single Cells with CFGen + + +
+ Generative modeling of single-cell RNA-seq data is crucial for tasks like +trajectory inference, batch effect removal, and simulation of realistic +cellular data. However, recent deep generative models simulating synthetic +single cells from noise operate on pre-processed continuous gene expression +approximations, overlooking the discrete nature of single-cell data, which +limits their effectiveness and hinders the incorporation of robust noise +models. Additionally, aspects like controllable multi-modal and multi-label +generation of cellular data remain underexplored. This work introduces CellFlow +for Generation (CFGen), a flow-based conditional generative model that +preserves the inherent discreteness of single-cell data. CFGen generates +whole-genome multi-modal single-cell data reliably, improving the recovery of +crucial biological data characteristics while tackling relevant generative +tasks such as rare cell type augmentation and batch correction. We also +introduce a novel framework for compositional data generation using Flow +Matching. By showcasing CFGen on a diverse set of biological datasets and +settings, we provide evidence of its value to the fields of computational +biology and deep generative models. + +
+
+ comment: 41 pages, 22 figures +
+
+
+
+
+
+
+
+ + Machine Learning 86 + +
+
+
+ + ♻ ☆ Langevin Multiplicative Weights Update with Applications in Polynomial + Portfolio Management AAAI-2025 + + +
+ We consider nonconvex optimization problem over simplex, and more generally, +a product of simplices. We provide an algorithm, Langevin Multiplicative +Weights Update (LMWU) for solving global optimization problems by adding a +noise scaling with the non-Euclidean geometry in the simplex. Non-convex +optimization has been extensively studied by machine learning community due to +its application in various scenarios such as neural network approximation and +finding Nash equilibrium. Despite recent progresses on provable guarantee of +escaping and avoiding saddle point (convergence to local minima) and global +convergence of Langevin gradient based method without constraints, the global +optimization with constraints is less studied. We show that LMWU algorithm is +provably convergent to interior global minima with a non-asymptotic convergence +analysis. We verify the efficiency of the proposed algorithm in real data set +from polynomial portfolio management, where optimization of a highly non-linear +objective function plays a crucial role. + +
+
+ comment: Accepted for AAAI-2025 +
+
+
+
+
+ + ♻ ☆ Revisiting the Test-Time Scaling of o1-like Models: Do they Truly + Possess Test-Time Scaling Capabilities? + + +
+ The advent of test-time scaling in large language models (LLMs), exemplified +by OpenAI's o1 series, has advanced reasoning capabilities by scaling +computational resource allocation during inference. While successors like QwQ, +Deepseek-R1 (R1) and LIMO replicate these advancements, whether these models +truly possess test-time scaling capabilities remains underexplored. This study +found that longer CoTs of these o1-like models do not consistently enhance +accuracy; in fact, correct solutions are often shorter than incorrect ones for +the same questions. Further investigation shows this phenomenon is closely +related to models' self-revision capabilities - longer CoTs contain more +self-revisions, which often lead to performance degradation. We then compare +sequential and parallel scaling strategies on QwQ, R1 and LIMO, finding that +parallel scaling achieves better coverage and scalability. Based on these +insights, we propose Shortest Majority Vote, a method that combines parallel +scaling strategies with CoT length characteristics, significantly improving +models' test-time scalability compared to conventional majority voting +approaches. + +
+
+ comment: Add the github link +
+
+
+
+
+ + ♻ ☆ Gradient-Based Multi-Objective Deep Learning: Algorithms, Theories, + Applications, and Beyond + + +
+ Multi-objective optimization (MOO) in deep learning aims to simultaneously +optimize multiple conflicting objectives, a challenge frequently encountered in +areas like multi-task learning and multi-criteria learning. Recent advancements +in gradient-based MOO methods have enabled the discovery of diverse types of +solutions, ranging from a single balanced solution to finite or even infinite +Pareto sets, tailored to user needs. These developments have broad applications +across domains such as reinforcement learning, computer vision, recommendation +systems, and large language models. This survey provides the first +comprehensive review of gradient-based MOO in deep learning, covering +algorithms, theories, and practical applications. By unifying various +approaches and identifying critical challenges, it serves as a foundational +resource for driving innovation in this evolving field. A comprehensive list of +MOO algorithms in deep learning is available at +https://github.com/Baijiong-Lin/Awesome-Multi-Objective-Deep-Learning. + +
+
+
+
+
+ + ♻ ☆ Preconditioned Inexact Stochastic ADMM for Deep Model + + +
+ The recent advancement of foundation models (FMs) has brought about a +paradigm shift, revolutionizing various sectors worldwide. The popular +optimizers used to train these models are stochastic gradient descent-based +algorithms, which face inherent limitations, such as slow convergence and +stringent assumptions for convergence. In particular, data heterogeneity +arising from distributed settings poses significant challenges to their +theoretical and numerical performance. This paper develops an algorithm, PISA +(Preconditioned Inexact Stochastic Alternating Direction Method of +Multipliers), which enables scalable parallel computing and supports various +second-moment schemes. Grounded in rigorous theoretical guarantees, the +algorithm converges under the sole assumption of Lipschitz continuity of the +gradient, thereby removing the need for other conditions commonly imposed by +stochastic methods. This capability enables PISA to tackle the challenge of +data heterogeneity effectively. Comprehensive experimental evaluations for +training or fine-tuning diverse FMs, including vision models, large language +models, reinforcement learning models, generative adversarial networks, and +recurrent neural networks, demonstrate its superior numerical performance +compared to various state-of-the-art optimizers. + +
+
+
+
+
+ + ♻ ☆ Kinetix: Investigating the Training of General Agents through Open-Ended + Physics-Based Control Tasks ICLR 2025 + + +
+ While large models trained with self-supervised learning on offline datasets +have shown remarkable capabilities in text and image domains, achieving the +same generalisation for agents that act in sequential decision problems remains +an open challenge. In this work, we take a step towards this goal by +procedurally generating tens of millions of 2D physics-based tasks and using +these to train a general reinforcement learning (RL) agent for physical +control. To this end, we introduce Kinetix: an open-ended space of +physics-based RL environments that can represent tasks ranging from robotic +locomotion and grasping to video games and classic RL environments, all within +a unified framework. Kinetix makes use of our novel hardware-accelerated +physics engine Jax2D that allows us to cheaply simulate billions of environment +steps during training. Our trained agent exhibits strong physical reasoning +capabilities in 2D space, being able to zero-shot solve unseen human-designed +environments. Furthermore, fine-tuning this general agent on tasks of interest +shows significantly stronger performance than training an RL agent *tabula +rasa*. This includes solving some environments that standard RL training +completely fails at. We believe this demonstrates the feasibility of large +scale, mixed-quality pre-training for online RL and we hope that Kinetix will +serve as a useful framework to investigate this further. + +
+
+ comment: ICLR 2025 Oral. The first two authors contributed equally. Project + page located at: https://kinetix-env.github.io/ +
+
+
+
+
+ + ♻ ☆ Optimizing Backward Policies in GFlowNets via Trajectory Likelihood + Maximization ICLR 2025 + + +
+ Generative Flow Networks (GFlowNets) are a family of generative models that +learn to sample objects with probabilities proportional to a given reward +function. The key concept behind GFlowNets is the use of two stochastic +policies: a forward policy, which incrementally constructs compositional +objects, and a backward policy, which sequentially deconstructs them. Recent +results show a close relationship between GFlowNet training and +entropy-regularized reinforcement learning (RL) problems with a particular +reward design. However, this connection applies only in the setting of a fixed +backward policy, which might be a significant limitation. As a remedy to this +problem, we introduce a simple backward policy optimization algorithm that +involves direct maximization of the value function in an entropy-regularized +Markov Decision Process (MDP) over intermediate rewards. We provide an +extensive experimental evaluation of the proposed approach across various +benchmarks in combination with both RL and GFlowNet algorithms and demonstrate +its faster convergence and mode discovery in complex environments. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Optimize Incompatible Parameters through Compatibility-aware Knowledge + Integration AAAI'25 + + +
+ Deep neural networks have become foundational to advancements in multiple +domains, including recommendation systems, natural language processing, and so +on. Despite their successes, these models often contain incompatible parameters +that can be underutilized or detrimental to model performance, particularly +when faced with specific, varying data distributions. Existing research excels +in removing such parameters or merging the outputs of multiple different +pretrained models. However, the former focuses on efficiency rather than +performance, while the latter requires several times more computing and storage +resources to support inference. In this paper, we set the goal to explicitly +improve these incompatible parameters by leveraging the complementary strengths +of different models, thereby directly enhancing the models without any +additional parameters. Specifically, we propose Compatibility-aware Knowledge +Integration (CKI), which consists of Parameter Compatibility Assessment and +Parameter Splicing, which are used to evaluate the knowledge content of +multiple models and integrate the knowledge into one model, respectively. The +integrated model can be used directly for inference or for further fine-tuning. +We conduct extensive experiments on various datasets for recommendation and +language tasks, and the results show that Compatibility-aware Knowledge +Integration can effectively optimize incompatible parameters under multiple +tasks and settings to break through the training limit of the original model +without increasing the inference cost. + +
+
+ comment: Published on AAAI'25(Oral): The Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ♻ ☆ AnyECG: Foundational Models for Multitask Cardiac Analysis in Real-World + Settings + + +
+ Electrocardiogram (ECG), a non-invasive and affordable tool for cardiac +monitoring, is highly sensitive in detecting acute heart attacks. However, due +to the lengthy nature of ECG recordings, numerous machine learning methods have +been developed for automated heart disease detection to reduce human workload. +Despite these efforts, performance remains suboptimal. A key obstacle is the +inherent complexity of ECG data, which includes heterogeneity (e.g., varying +sampling rates), high levels of noise, demographic-related pattern shifts, and +intricate rhythm-event associations. To overcome these challenges, this paper +introduces AnyECG, a foundational model designed to extract robust +representations from any real-world ECG data. Specifically, a tailored ECG +Tokenizer encodes each fixed-duration ECG fragment into a token and, guided by +proxy tasks, converts noisy, continuous ECG features into discrete, compact, +and clinically meaningful local rhythm codes. These codes encapsulate basic +morphological, frequency, and demographic information (e.g., sex), effectively +mitigating signal noise. We further pre-train the AnyECG to learn rhythmic +pattern associations across ECG tokens, enabling the capture of cardiac event +semantics. By being jointly pre-trained on diverse ECG data sources, AnyECG is +capable of generalizing across a wide range of downstream tasks where ECG +signals are recorded from various devices and scenarios. The experimental +results show that AnyECG achieves an average performance improvement of 6% +across four critical tasks-anomaly detection, arrhythmia classification, +corrupted lead generation, and ultra-long ECG recognition. AnyECG learns common +ECG rhythm from data and significantly outperforms state-of-the-art methods in +each of these tasks. + +
+
+
+
+
+ + ♻ ☆ Nonasymptotic Analysis of Stochastic Gradient Descent with the + Richardson-Romberg Extrapolation ICLR-2025 + + +
+ We address the problem of solving strongly convex and smooth minimization +problems using stochastic gradient descent (SGD) algorithm with a constant step +size. Previous works suggested to combine the Polyak-Ruppert averaging +procedure with the Richardson-Romberg extrapolation to reduce the asymptotic +bias of SGD at the expense of a mild increase of the variance. We significantly +extend previous results by providing an expansion of the mean-squared error of +the resulting estimator with respect to the number of iterations $n$. We show +that the root mean-squared error can be decomposed into the sum of two terms: a +leading one of order $\mathcal{O}(n^{-1/2})$ with explicit dependence on a +minimax-optimal asymptotic covariance matrix, and a second-order term of order +$\mathcal{O}(n^{-3/4})$, where the power $3/4$ is best known. We also extend +this result to the higher-order moment bounds. Our analysis relies on the +properties of the SGD iterates viewed as a time-homogeneous Markov chain. In +particular, we establish that this chain is geometrically ergodic with respect +to a suitably defined weighted Wasserstein semimetric. + +
+
+ comment: ICLR-2025, camera-ready version +
+
+
+
+
+ + ♻ ☆ MOOSE-Chem: Large Language Models for Rediscovering Unseen Chemistry + Scientific Hypotheses ICLR 2025 + + +
+ Scientific discovery contributes largely to human society's prosperity, and +recent progress shows that LLMs could potentially catalyze this process. +However, it is still unclear whether LLMs can discover novel and valid +hypotheses in chemistry. In this work, we investigate this central research +question: Can LLMs automatically discover novel and valid chemistry research +hypotheses given only a chemistry research background (consisting of a research +question and/or a background survey), without limitation on the domain of the +research question? After extensive discussions with chemistry experts, we +propose an assumption that a majority of chemistry hypotheses can result +from a research background and several inspirations. With this key insight, we +break the central question into three smaller fundamental questions. In brief, +they are: (1) given a background question, whether LLMs can retrieve good +inspirations; (2) with background and inspirations, whether LLMs can lead to +hypotheses; and (3) whether LLMs can identify good hypotheses to rank them +higher. To investigate these questions, we construct a benchmark consisting of +51 chemistry papers published in Nature, Science, or venues of a similar level in 2024 +(all papers have only been available online since 2024). Every paper is divided by +chemistry PhD students into three components: background, inspirations, and +hypothesis. The goal is to rediscover the hypothesis, given only the background +and a large randomly selected chemistry literature corpus containing the ground +truth inspiration papers, with LLMs trained with data up to 2023. We also +develop an LLM-based multi-agent framework that leverages the assumption, +consisting of three stages reflecting the three smaller questions. The proposed +method can rediscover many hypotheses with very high similarity with the ground +truth ones, covering the main innovations. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ A Meta-Learning Approach to Bayesian Causal Discovery + + +
+ Discovering a unique causal structure is difficult due to both inherent +identifiability issues, and the consequences of finite data. As such, +uncertainty over causal structures, such as that obtained from a Bayesian +posterior, is often necessary for downstream tasks. Finding an accurate +approximation to this posterior is challenging, due to the large number of +possible causal graphs, as well as the difficulty in the subproblem of finding +posteriors over the functional relationships of the causal edges. Recent works +have used meta-learning to view the problem of estimating the maximum +a-posteriori causal graph as supervised learning. Yet, these methods are +limited when estimating the full posterior as they fail to encode key +properties of the posterior, such as correlation between edges and permutation +equivariance with respect to nodes. Further, these methods also cannot reliably +sample from the posterior over causal structures. To address these limitations, +we propose a Bayesian meta learning model that allows for sampling causal +structures from the posterior and encodes these key properties. We compare our +meta-Bayesian causal discovery against existing Bayesian causal discovery +methods, demonstrating the advantages of directly learning a posterior over +causal structure. + +
+
+
+
+
+ + ♻ ☆ Poison-splat: Computation Cost Attack on 3D Gaussian Splatting ICLR 2025 + + +
+ 3D Gaussian splatting (3DGS), known for its groundbreaking performance and +efficiency, has become a dominant 3D representation and brought progress to +many 3D vision tasks. However, in this work, we reveal a significant security +vulnerability that has been largely overlooked in 3DGS: the computation cost of +training 3DGS could be maliciously tampered by poisoning the input data. By +developing an attack named Poison-splat, we reveal a novel attack surface where +the adversary can poison the input images to drastically increase the +computation memory and time needed for 3DGS training, pushing the algorithm +towards its worst computation complexity. In extreme cases, the attack can even +consume all allocable memory, leading to a Denial-of-Service (DoS) that +disrupts servers, resulting in practical damages to real-world 3DGS service +vendors. Such a computation cost attack is achieved by addressing a bi-level +optimization problem through three tailored strategies: attack objective +approximation, proxy model rendering, and optional constrained optimization. +These strategies not only ensure the effectiveness of our attack but also make +it difficult to defend with simple defensive measures. We hope the revelation +of this novel attack surface can spark attention to this crucial yet overlooked +vulnerability of 3DGS systems. Our code is available at +https://github.com/jiahaolu97/poison-splat . + +
+
+ comment: Accepted by ICLR 2025 as a spotlight paper +
+
+
+
+
+ + ♻ ☆ On the Geometry and Optimization of Polynomial Convolutional Networks AISTATS 2025 + + +
+ We study convolutional neural networks with monomial activation functions. +Specifically, we prove that their parameterization map is regular and is an +isomorphism almost everywhere, up to rescaling the filters. By leveraging on +tools from algebraic geometry, we explore the geometric properties of the image +in function space of this map - typically referred to as neuromanifold. In +particular, we compute the dimension and the degree of the neuromanifold, which +measure the expressivity of the model, and describe its singularities. +Moreover, for a generic large dataset, we derive an explicit formula that +quantifies the number of critical points arising in the optimization of a +regression loss. + +
+
+ comment: Accepted at AISTATS 2025 +
+
+
+
+
+ + ♻ ☆ Federated Temporal Graph Clustering + + +
+ Temporal graph clustering is a complex task that involves discovering +meaningful structures in dynamic graphs where relationships and entities change +over time. Existing methods typically require centralized data collection, +which poses significant privacy and communication challenges. In this work, we +introduce a novel Federated Temporal Graph Clustering (FTGC) framework that +enables decentralized training of graph neural networks (GNNs) across multiple +clients, ensuring data privacy throughout the process. Our approach +incorporates a temporal aggregation mechanism to effectively capture the +evolution of graph structures over time and a federated optimization strategy +to collaboratively learn high-quality clustering representations. By preserving +data privacy and reducing communication overhead, our framework achieves +competitive performance on temporal graph datasets, making it a promising +solution for privacy-sensitive, real-world applications involving dynamic data. + +
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ MLOmics: Benchmark for Machine Learning on Cancer Multi-Omics Data + + +
+ Framing the investigation of diverse cancers as a machine learning problem +has recently shown significant potential in multi-omics analysis and cancer +research. Empowering these successful machine learning models are the +high-quality training datasets with sufficient data volume and adequate +preprocessing. However, while there exist several public data portals including +The Cancer Genome Atlas (TCGA) multi-omics initiative or open-bases such as the +LinkedOmics, these databases are not off-the-shelf for existing machine +learning models. In this paper we propose MLOmics, an open cancer multi-omics +benchmark aiming at serving better the development and evaluation of +bioinformatics and machine learning models. MLOmics contains 8,314 patient +samples covering all 32 cancer types with four omics types, stratified +features, and extensive baselines. Complementary support for downstream +analysis and bio-knowledge linking are also included to support +interdisciplinary analysis. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Optimal Brain Apoptosis ICLR 2025 + + +
+ The increasing complexity and parameter count of Convolutional Neural +Networks (CNNs) and Transformers pose challenges in terms of computational +efficiency and resource demands. Pruning has been identified as an effective +strategy to address these challenges by removing redundant elements such as +neurons, channels, or connections, thereby enhancing computational efficiency +without heavily compromising performance. This paper builds on the foundational +work of Optimal Brain Damage (OBD) by advancing the methodology of parameter +importance estimation using the Hessian matrix. Unlike previous approaches that +rely on approximations, we introduce Optimal Brain Apoptosis (OBA), a novel +pruning method that calculates the Hessian-vector product value directly for +each parameter. By decomposing the Hessian matrix across network layers and +identifying conditions under which inter-layer Hessian submatrices are +non-zero, we propose a highly efficient technique for computing the +second-order Taylor expansion of parameters. This approach allows for a more +precise pruning process, particularly in the context of CNNs and Transformers, +as validated in our experiments including VGG19, ResNet32, ResNet50, and +ViT-B/16 on CIFAR10, CIFAR100 and Imagenet datasets. Our code is available at +https://github.com/NEU-REAL/OBA. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Exploring the Effectiveness of Object-Centric Representations in Visual + Question Answering: Comparative Insights with Foundation Models ICLR 2025 + + +
+ Object-centric (OC) representations, which model visual scenes as +compositions of discrete objects, have the potential to be used in various +downstream tasks to achieve systematic compositional generalization and +facilitate reasoning. However, these claims have yet to be thoroughly validated +empirically. Recently, foundation models have demonstrated unparalleled +capabilities across diverse domains, from language to computer vision, +positioning them as a potential cornerstone of future research for a wide range +of computational tasks. In this paper, we conduct an extensive empirical study +on representation learning for downstream Visual Question Answering (VQA), +which requires an accurate compositional understanding of the scene. We +thoroughly investigate the benefits and trade-offs of OC models and alternative +approaches including large pre-trained foundation models on both synthetic and +real-world data, ultimately identifying a promising path to leverage the +strengths of both paradigms. The extensiveness of our study, encompassing over +600 downstream VQA models and 15 different types of upstream representations, +also provides several additional insights that we believe will be of interest +to the community at large. + +
+
+ comment: Published at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Adaptive $Q$-Network: On-the-fly Target Selection for Deep Reinforcement + Learning ICLR + + +
+ Deep Reinforcement Learning (RL) is well known for being highly sensitive to +hyperparameters, requiring substantial effort from practitioners to optimize them +for the problem at hand. This also limits the applicability of RL in real-world +scenarios. In recent years, the field of automated Reinforcement Learning +(AutoRL) has grown in popularity by trying to address this issue. However, +these approaches typically hinge on additional samples to select +well-performing hyperparameters, hindering sample-efficiency and practicality. +Furthermore, most AutoRL methods are heavily based on already existing AutoML +methods, which were originally developed neglecting the additional challenges +inherent to RL due to its non-stationarities. In this work, we propose a new +approach for AutoRL, called Adaptive $Q$-Network (AdaQN), that is tailored to +RL to take into account the non-stationarity of the optimization procedure +without requiring additional samples. AdaQN learns several $Q$-functions, each +one trained with different hyperparameters, which are updated online using the +$Q$-function with the smallest approximation error as a shared target. Our +selection scheme simultaneously handles different hyperparameters while coping +with the non-stationarity induced by the RL optimization procedure and being +orthogonal to any critic-based RL algorithm. We demonstrate that AdaQN is +theoretically sound and empirically validate it in MuJoCo control problems and +Atari $2600$ games, showing benefits in sample-efficiency, overall performance, +robustness to stochasticity and training stability. + +
+
+ comment: Accepted at ICLR https://iclr.cc/virtual/2025/poster/28508 +
+
+
+
+
+ + ♻ ☆ Offline Model-Based Optimization by Learning to Rank ICLR 2025 + + +
+ Offline model-based optimization (MBO) aims to identify a design that +maximizes a black-box function using only a fixed, pre-collected dataset of +designs and their corresponding scores. A common approach in offline MBO is to +train a regression-based surrogate model by minimizing mean squared error (MSE) +and then find the best design within this surrogate model by different +optimizers (e.g., gradient ascent). However, a critical challenge is the risk +of out-of-distribution errors, i.e., the surrogate model may typically +overestimate the scores and mislead the optimizers into suboptimal regions. +Prior works have attempted to address this issue in various ways, such as using +regularization techniques and ensemble learning to enhance the robustness of +the model, but the issue still remains. In this paper, we argue that regression models +trained with MSE are not well-aligned with the primary goal of offline MBO, +which is to select promising designs rather than to predict their scores +precisely. Notably, if a surrogate model can maintain the order of candidate +designs based on their relative score relationships, it can produce the best +designs even without precise predictions. To validate it, we conduct +experiments to compare the relationship between the quality of the final +designs and MSE, finding that the correlation is really very weak. In contrast, +a metric that measures order-maintaining quality shows a significantly stronger +correlation. Based on this observation, we propose learning a ranking-based +model that leverages learning to rank techniques to prioritize promising +designs based on their relative scores. We show that the generalization error +on ranking loss can be well bounded. Empirical results across diverse tasks +demonstrate the superior performance of our proposed ranking-based models over +twenty existing methods. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Variational Best-of-N Alignment + + +
+ Best-of-N (BoN) is a popular and effective algorithm for aligning language +models to human preferences. The algorithm works as follows: at inference time, +N samples are drawn from the language model, and the sample with the highest +reward, as judged by a reward model, is returned as the output. Despite its +effectiveness, BoN is computationally expensive; it reduces sampling throughput +by a factor of N. To make BoN more efficient at inference time, one strategy is +to fine-tune the language model to mimic what BoN does during inference. To +achieve this, we derive the distribution induced by the BoN algorithm. We then +propose to fine-tune the language model to minimize backward KL divergence to +the BoN distribution. Our approach is analogous to mean-field variational +inference and, thus, we term it variational BoN (vBoN). To the extent this +fine-tuning is successful and we end up with a good approximation, we have +reduced the inference cost by a factor of N. Our experiments on controlled +generation and summarization tasks show that BoN is the most effective +alignment method, and our variational approximation to BoN achieves the closest +performance to BoN and surpasses models fine-tuned using the standard +KL-constrained RL objective. In the controlled generation task, vBoN appears +more frequently on the Pareto frontier of reward and KL divergence compared to +other alignment methods. In the summarization task, vBoN achieves high reward +values across various sampling temperatures. + +
+
+
+
+
+ + ♻ ☆ FLEXtime: Filterbank learning to explain time series + + +
+ State-of-the-art methods for explaining predictions from time series involve +learning an instance-wise saliency mask for each time step; however, many types +of time series are difficult to interpret in the time domain, due to the +inherently complex nature of the data. Instead, we propose to view time series +explainability as saliency maps over interpretable parts, leaning on +established signal processing methodology on signal decomposition. +Specifically, we propose a new method called FLEXtime that uses a bank of +bandpass filters to split the time series into frequency bands. Then, we learn +the combination of these bands that optimally explains the model's prediction. +Our extensive evaluation shows that, on average, FLEXtime outperforms +state-of-the-art explainability methods across a range of datasets. FLEXtime +fills an important gap in the current time series explainability methodology +and is a valuable tool for a wide range of time series such as EEG and audio. +Code will be made available at https://github.com/theabrusch/FLEXtime. + +
+
+
+
+
+ + ♻ ☆ Adaptive Prompt: Unlocking the Power of Visual Prompt Tuning + + +
+ Visual Prompt Tuning (VPT) has recently emerged as a powerful method for +adapting pre-trained vision models to downstream tasks. By introducing +learnable prompt tokens as task-specific instructions, VPT effectively guides +pre-trained transformer models with minimal overhead. Despite its empirical +success, a comprehensive theoretical understanding of VPT remains an active +area of research. Building on recent insights into the connection between +mixture of experts and prompt-based approaches, we identify a key limitation in +VPT: the restricted functional expressiveness in prompt formulation. To address +this limitation, we propose Visual Adaptive Prompt Tuning (VAPT), a new +generation of prompts that redefines prompts as adaptive functions of the +input. Our theoretical analysis shows that this simple yet intuitive approach +achieves optimal sample efficiency. Empirical results on VTAB-1K and FGVC +further demonstrate VAPT's effectiveness, with performance gains of 7.34% and +1.04% over fully fine-tuning baselines, respectively. Notably, VAPT also +surpasses VPT by a substantial margin while using fewer parameters. These +results highlight both the effectiveness and efficiency of our method and pave +the way for future research to explore the potential of adaptive prompts. + +
+
+ comment: 57 pages, 10 figures, 18 tables +
+
+
+
+
+ + ♻ ☆ HOPE: A Reinforcement Learning-based Hybrid Policy Path Planner for + Diverse Parking Scenarios + + +
+ Automated parking stands as a highly anticipated application of autonomous +driving technology. However, existing path planning methodologies fall short of +addressing this need due to their incapability to handle the diverse and +complex parking scenarios in reality. While non-learning methods provide +reliable planning results, they are vulnerable to intricate occasions, whereas +learning-based ones are good at exploration but unstable in converging to +feasible solutions. To leverage the strengths of both approaches, we introduce +Hybrid pOlicy Path plannEr (HOPE). This novel solution integrates a +reinforcement learning agent with Reeds-Shepp curves, enabling effective +planning across diverse scenarios. HOPE guides the exploration of the +reinforcement learning agent by applying an action mask mechanism and employs a +transformer to integrate the perceived environmental information with the mask. +To facilitate the training and evaluation of the proposed planner, we propose a +criterion for categorizing the difficulty level of parking scenarios based on +space and obstacle distribution. Experimental results demonstrate that our +approach outperforms typical rule-based algorithms and traditional +reinforcement learning methods, showing higher planning success rates and +generalization across various scenarios. We also conduct real-world experiments +to verify the practicability of HOPE. The code for our solution is openly +available on https://github.com/jiamiya/HOPE. + +
+
+ comment: Accepted by T-ITS. 11 pages, 5 tables, 6 figures, 2 page appendix +
+
+
+
+
+ + ♻ ☆ PnP-Flow: Plug-and-Play Image Restoration with Flow Matching + + +
+ In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm +for solving imaging inverse problems. PnP methods leverage the strength of +pre-trained denoisers, often deep neural networks, by integrating them in +optimization schemes. While they achieve state-of-the-art performance on +various inverse problems in imaging, PnP approaches face inherent limitations +on more generative tasks like inpainting. On the other hand, generative models +such as Flow Matching pushed the boundary in image sampling yet lack a clear +method for efficient use in image restoration. We propose to combine the PnP +framework with Flow Matching (FM) by defining a time-dependent denoiser using a +pre-trained FM model. Our algorithm alternates between gradient descent steps +on the data-fidelity term, reprojections onto the learned FM path, and +denoising. Notably, our method is computationally efficient and +memory-friendly, as it avoids backpropagation through ODEs and trace +computations. We evaluate its performance on denoising, super-resolution, +deblurring, and inpainting tasks, demonstrating superior results compared to +existing PnP algorithms and Flow Matching based state-of-the-art methods. + +
+
+
+
+
+ + ♻ Meta Curvature-Aware Minimization for Domain Generalization + + +
+ Domain generalization (DG) aims to enhance the ability of models trained on +source domains to generalize effectively to unseen domains. Recently, +Sharpness-Aware Minimization (SAM) has shown promise in this area by reducing +the sharpness of the loss landscape to obtain more generalized models. However, +SAM and its variants sometimes fail to guide the model toward a flat minimum, +and their training processes exhibit limitations, hindering further +improvements in model generalization. In this paper, we first propose an +improved model training process aimed at encouraging the model to converge to a +flat minimum. To achieve this, we design a curvature metric that has a minimal +effect when the model is far from convergence but becomes increasingly +influential in indicating the curvature of the minima as the model approaches a +local minimum. Then we derive a novel algorithm from this metric, called Meta +Curvature-Aware Minimization (MeCAM), to minimize the curvature around the +local minima. Specifically, the optimization objective of MeCAM simultaneously +minimizes the regular training loss, the surrogate gap of SAM, and the +surrogate gap of meta-learning. We provide theoretical analysis on MeCAM's +generalization error and convergence rate, and demonstrate its superiority over +existing DG methods through extensive experiments on five benchmark DG +datasets, including PACS, VLCS, OfficeHome, TerraIncognita, and DomainNet. Code +will be available on GitHub. + +
+
+ comment: 22 pages, 5 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ Towards Training One-Step Diffusion Models Without Distillation + + +
+ Recent advances in one-step generative models typically follow a two-stage +process: first training a teacher diffusion model and then distilling it into a +one-step student model. This distillation process traditionally relies on both +the teacher model's score function to compute the distillation loss and its +weights for student initialization. In this paper, we explore whether one-step +generative models can be trained directly without this distillation process. +First, we show that the teacher's score function is not essential and propose a +family of distillation methods that achieve competitive results without relying +on score estimation. Next, we demonstrate that initialization from teacher +weights is indispensable in successful training. Surprisingly, we find that +this benefit is not due to improved "input-output" mapping but rather the +learned feature representations, which dominate distillation quality. Our +findings provide a better understanding of the role of initialization in +one-step model training and its impact on distillation quality. + +
+
+ comment: 13 pages, Technical Report +
+
+
+
+
+ + ♻ ☆ TAG: A Decentralized Framework for Multi-Agent Hierarchical + Reinforcement Learning + + +
+ Hierarchical organization is fundamental to biological systems and human +societies, yet artificial intelligence systems often rely on monolithic +architectures that limit adaptability and scalability. Current hierarchical +reinforcement learning (HRL) approaches typically restrict hierarchies to two +levels or require centralized training, which limits their practical +applicability. We introduce TAME Agent Framework (TAG), a framework for +constructing fully decentralized hierarchical multi-agent systems. TAG enables +hierarchies of arbitrary depth through a novel LevelEnv concept, which +abstracts each hierarchy level as the environment for the agents above it. This +approach standardizes information flow between levels while preserving loose +coupling, allowing for seamless integration of diverse agent types. We +demonstrate the effectiveness of TAG by implementing hierarchical architectures +that combine different RL agents across multiple levels, achieving improved +performance over classical multi-agent RL baselines on standard benchmarks. Our +results show that decentralized hierarchical organization enhances both +learning speed and final performance, positioning TAG as a promising direction +for scalable multi-agent systems. + +
+
+
+
+
+ + ♻ ☆ Slowing Down Forgetting in Continual Learning + + +
+ A common challenge in continual learning (CL) is catastrophic forgetting, +where the performance on old tasks drops after new, additional tasks are +learned. In this paper, we propose a novel framework called ReCL to slow down +forgetting in CL. Our framework exploits an implicit bias of gradient-based +neural networks due to which these converge to margin maximization points. Such +convergence points allow us to reconstruct old data from previous tasks, which +we then combine with the current training data. Our framework is flexible and +can be applied on top of existing, state-of-the-art CL methods. We further +demonstrate the performance gain from our framework across a large series of +experiments, including two challenging CL scenarios (class incremental and +domain incremental learning), different datasets (MNIST, CIFAR10, +TinyImagenet), and different network architectures. Across all experiments, we +find large performance gains through ReCL. To the best of our knowledge, our +framework is the first to address catastrophic forgetting by leveraging models +in CL as their own memory buffers. + +
+
+
+
+
+ + ♻ ☆ Causality Is Key to Understand and Balance Multiple Goals in Trustworthy + ML and Foundation Models + + +
+ Ensuring trustworthiness in machine learning (ML) systems is crucial as they +become increasingly embedded in high-stakes domains. This paper advocates for +integrating causal methods into machine learning to navigate the trade-offs +among key principles of trustworthy ML, including fairness, privacy, +robustness, accuracy, and explainability. While these objectives should ideally +be satisfied simultaneously, they are often addressed in isolation, leading to +conflicts and suboptimal solutions. Drawing on existing applications of +causality in ML that successfully align goals such as fairness and accuracy or +privacy and robustness, this paper argues that a causal approach is essential +for balancing multiple competing objectives in both trustworthy ML and +foundation models. Beyond highlighting these trade-offs, we examine how +causality can be practically integrated into ML and foundation models, offering +solutions to enhance their reliability and interpretability. Finally, we +discuss the challenges, limitations, and opportunities in adopting causal +frameworks, paving the way for more accountable and ethically sound AI systems. + +
+
+
+
+
+ + ♻ ☆ MANTRA: The Manifold Triangulations Assemblage ICLR 2025 + + +
+ The rising interest in leveraging higher-order interactions present in +complex systems has led to a surge in more expressive models exploiting +higher-order structures in the data, especially in topological deep learning +(TDL), which designs neural networks on higher-order domains such as simplicial +complexes. However, progress in this field is hindered by the scarcity of +datasets for benchmarking these architectures. To address this gap, we +introduce MANTRA, the first large-scale, diverse, and intrinsically +higher-order dataset for benchmarking higher-order models, comprising over +43,000 and 250,000 triangulations of surfaces and three-dimensional manifolds, +respectively. With MANTRA, we assess several graph- and simplicial +complex-based models on three topological classification tasks. We demonstrate +that while simplicial complex-based neural networks generally outperform their +graph-based counterparts in capturing simple topological invariants, they also +struggle, suggesting a rethink of TDL. Thus, MANTRA serves as a benchmark for +assessing and advancing topological methods, leading the way for more effective +higher-order models. + +
+
+ comment: Accepted at ICLR 2025 (https://openreview.net/forum?id=X6y5CC44HM) +
+
+
+
+
+ + ♻ ☆ Attacking Large Language Models with Projected Gradient Descent + + +
+ Current LLM alignment methods are readily broken through specifically crafted +adversarial prompts. While crafting adversarial prompts using discrete +optimization is highly effective, such attacks typically use more than 100,000 +LLM calls. This high computational cost makes them unsuitable for, e.g., +quantitative analyses and adversarial training. To remedy this, we revisit +Projected Gradient Descent (PGD) on the continuously relaxed input prompt. +Although previous attempts with ordinary gradient-based attacks largely failed, +we show that carefully controlling the error introduced by the continuous +relaxation tremendously boosts their efficacy. Our PGD for LLMs is up to one +order of magnitude faster than state-of-the-art discrete optimization to +achieve the same devastating attack results. + +
+
+
+
+
+ + ♻ ☆ EXACFS -- A CIL Method to mitigate Catastrophic Forgetting + + +
+ Deep neural networks (DNNs) excel at learning from static datasets but +struggle with continual learning, where data arrives sequentially. Catastrophic +forgetting, the phenomenon of forgetting previously learned knowledge, is a +primary challenge. This paper introduces EXponentially Averaged Class-wise +Feature Significance (EXACFS) to mitigate this issue in the class incremental +learning (CIL) setting. By estimating the significance of model features for +each learned class using loss gradients, gradually aging the significance +through the incremental tasks and preserving the significant features through a +distillation loss, EXACFS effectively balances remembering old knowledge +(stability) and learning new knowledge (plasticity). Extensive experiments on +CIFAR-100 and ImageNet-100 demonstrate EXACFS's superior performance in +preserving stability while acquiring plasticity. + +
+
+
+
+
+ + ♻ ☆ Exact Certification of (Graph) Neural Networks Against Label Poisoning ICLR 2025 + + +
+ Machine learning models are highly vulnerable to label flipping, i.e., the +adversarial modification (poisoning) of training labels to compromise +performance. Thus, deriving robustness certificates is important to guarantee +that test predictions remain unaffected and to understand worst-case robustness +behavior. However, for Graph Neural Networks (GNNs), the problem of certifying +label flipping has so far been unsolved. We change this by introducing an exact +certification method, deriving both sample-wise and collective certificates. +Our method leverages the Neural Tangent Kernel (NTK) to capture the training +dynamics of wide networks enabling us to reformulate the bilevel optimization +problem representing label flipping into a Mixed-Integer Linear Program (MILP). +We apply our method to certify a broad range of GNN architectures in node +classification tasks. Thereby, concerning the worst-case robustness to label +flipping: $(i)$ we establish hierarchies of GNNs on different benchmark graphs; +$(ii)$ quantify the effect of architectural choices such as activations, depth +and skip-connections; and surprisingly, $(iii)$ uncover a novel phenomenon of +the robustness plateauing for intermediate perturbation budgets across all +investigated datasets and architectures. While we focus on GNNs, our +certificates are applicable to sufficiently wide NNs in general through their +NTK. Thus, our work presents the first exact certificate to a poisoning attack +ever derived for neural networks, which could be of independent interest. The +code is available at https://github.com/saper0/qpcert. + +
+
+ comment: Published as a spotlight presentation at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ ReFocus: Reinforcing Mid-Frequency and Key-Frequency Modeling for + Multivariate Time Series Forecasting + + +
+ Recent advancements have progressively incorporated frequency-based +techniques into deep learning models, leading to notable improvements in +accuracy and efficiency for time series analysis tasks. However, the +Mid-Frequency Spectrum Gap in the real-world time series, where the energy is +concentrated at the low-frequency region while the middle-frequency band is +negligible, hinders the ability of existing deep learning models to extract the +crucial frequency information. Additionally, the shared Key-Frequency in +multivariate time series, where different time series share indistinguishable +frequency patterns, is rarely exploited by existing literature. This work +introduces a novel module, Adaptive Mid-Frequency Energy Optimizer, based on +convolution and residual learning, to emphasize the significance of +mid-frequency bands. We also propose an Energy-based Key-Frequency Picking +Block to capture shared Key-Frequency, which achieves superior inter-series +modeling performance with fewer parameters. A novel Key-Frequency Enhanced +Training strategy is employed to further enhance Key-Frequency modeling, where +spectral information from other channels is randomly introduced into each +channel. Our approach advanced multivariate time series forecasting on the +challenging Traffic, ECL, and Solar benchmarks, reducing MSE by 4%, 6%, and 5% +compared to the previous SOTA iTransformer. Code is available at this GitHub +Repository: https://github.com/Levi-Ackman/ReFocus. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Deep Learning-Driven Malware Classification with API Call Sequence + Analysis and Concept Drift Handling + + +
+ Malware classification in dynamic environments presents a significant +challenge due to concept drift, where the statistical properties of malware +data evolve over time, complicating detection efforts. To address this issue, +we propose a deep learning framework enhanced with a genetic algorithm to +improve malware classification accuracy and adaptability. Our approach +incorporates mutation operations and fitness score evaluations within genetic +algorithms to continuously refine the deep learning model, ensuring robustness +against evolving malware threats. Experimental results demonstrate that this +hybrid method significantly enhances classification performance and +adaptability, outperforming traditional static models. Our proposed approach +offers a promising solution for real-time malware classification in +ever-changing cybersecurity landscapes. + +
+
+
+
+
+ + ♻ ☆ Q-Adapter: Customizing Pre-trained LLMs to New Preferences with + Forgetting Mitigation ICLR 2025 + + +
+ Large Language Models (LLMs), trained on a large amount of corpus, have +demonstrated remarkable abilities. However, it may not be sufficient to +directly apply open-source LLMs like Llama to certain real-world scenarios, +since most of them are trained for \emph{general} purposes. Thus, the demands +for customizing publicly available LLMs emerge, but are currently +under-studied. In this work, we consider customizing pre-trained LLMs with new +human preferences. Specifically, the LLM should not only meet the new +preference but also preserve its original capabilities after customization. +Drawing inspiration from the observation that human preference can be expressed +as a reward model, we propose to cast LLM customization as optimizing the sum +of two reward functions, one of which (denoted as $r_1$) was used to pre-train +the LLM while the other (denoted as $r_2$) characterizes the new human +preference. The obstacle here is that both reward functions are unknown, making +the application of modern reinforcement learning methods infeasible. Thanks to +the residual Q-learning framework, we can restore the customized LLM with the +pre-trained LLM and the \emph{residual Q-function} without the reward function +$r_1$. Moreover, we find that for a fixed pre-trained LLM, the reward function +$r_2$ can be derived from the residual Q-function, enabling us to directly +learn the residual Q-function from the new human preference data upon the +Bradley-Terry model. We name our method Q-Adapter as it introduces an adapter +module to approximate the residual Q-function for customizing the pre-trained +LLM towards the new preference. Experiments based on the Llama-3.1 model on the +DSP dataset and HH-RLHF dataset illustrate the superior effectiveness of +Q-Adapter on both retaining existing knowledge and learning new preferences. +Code is available at https://github.com/mansicer/Q-Adapter. + +
+
+ comment: Camera ready version of ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Scintillation pulse characterization with spectrum-inspired temporal + neural networks: case studies on particle detector signals + + +
+ Particle detectors based on scintillators are widely used in high-energy +physics and astroparticle physics experiments, nuclear medicine imaging, +industrial and environmental detection, etc. Precisely extracting scintillation +signal characteristics at the event level is important for these applications, +not only with respect to understanding the scintillator itself, but also the kinds +and physical properties of incident particles. Recent research demonstrates that +data-driven neural networks surpass traditional statistical methods, especially +when the analytical form of signals is hard to obtain, or noise is significant. +However, most densely connected or convolution-based networks fail to fully +exploit the spectral and temporal structure of scintillation signals, leaving +considerable room for performance improvement. In this paper, we propose a network +architecture specially tailored for scintillation pulse characterization based +on previous works on time series analysis. The core insight is that, by +directly applying Fast Fourier Transform on original signals and utilizing +different frequency components, the proposed network architecture can serve as +a lightweight and enhanced representation learning backbone. We prove our idea +in two case studies: (a) simulation data generated with the setting of the LUX +dark matter detector, and (b) experimental electrical signals with fast +electronics to emulate scintillation variations for the NICA/MPD calorimeter. +The proposed model achieves significantly better results than the reference +model in literature and densely connected models, and demonstrates higher +cost-efficiency than conventional machine learning methods. + +
+
+ comment: 29 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ SURGE: On the Potential of Large Language Models as General-Purpose + Surrogate Code Executors + + +
+ Neural surrogate models have emerged as powerful and efficient tools in data +mining. Meanwhile, large language models (LLMs) have demonstrated remarkable +capabilities in code-related tasks. We investigate a novel application: using +LLMs as surrogate models for code execution prediction. Given LLMs' unique +ability to understand and process diverse programs, they present a promising +direction for building general-purpose surrogate models. To systematically +investigate this capability, we introduce SURGE, a comprehensive benchmark with +$1160$ problems covering $8$ key aspects: multi-language programming tasks, +competition-level programming problems, repository-level code analysis, +high-cost scientific computing, time-complexity-intensive algorithms, buggy +code analysis, programs dependent on specific compilers or execution +environments, and formal mathematical proof verification. Through extensive +empirical analysis of $21$ open-source and proprietary LLMs, we examine scaling +laws, data efficiency, and predictive accuracy. Our findings reveal important +insights about the feasibility of LLMs as efficient surrogates for +computational processes, with implications for automated software testing, +program analysis, and computational resource optimization in data mining +applications. Code and dataset are released at +https://github.com/Imbernoulli/SURGE. + +
+
+
+
+
+ + ♻ ☆ Robust Preference Optimization through Reward Model Distillation + + +
+ Language model (LM) post-training (or alignment) involves maximizing a reward +function that is derived from preference annotations. Direct Preference +Optimization (DPO) is a popular offline alignment method that trains a policy +directly on preference data without the need to train a reward model or apply +reinforcement learning. However, the empirical evidence suggests that DPO +typically assigns implicit rewards that overfit, and trend towards infinite +magnitude. This frequently leads to degenerate policies, sometimes causing even +the probabilities of the preferred generations to go to zero. In this work, we +analyze this phenomenon and use distillation to get a better proxy for the true +preference distribution over generation pairs: we train the LM such that its +induced implicit reward, i.e., the scaled log-likelihood ratio of the model to +the reference model, matches an explicit reward model trained on the preference +data. Moreover, to account for uncertainty in the reward model we are +distilling from, we optimize against a family of reward models that, as a +whole, is likely to include at least one reasonable proxy for the preference +distribution. Our results show that distilling from such a family of reward +models leads to improved robustness to distribution shift in preference +annotations, while preserving the simple supervised nature of DPO. + +
+
+
+
+
+ + ♻ ☆ Towards Graph Foundation Models: A Study on the Generalization of + Positional and Structural Encodings + + +
+ Recent advances in integrating positional and structural encodings (PSEs) +into graph neural networks (GNNs) have significantly enhanced their performance +across various graph learning tasks. However, the general applicability of +these encodings and their potential to serve as foundational representations +for graphs remain uncertain. This paper investigates the fine-tuning +efficiency, scalability with sample size, and generalization capability of +learnable PSEs across diverse graph datasets. Specifically, we evaluate their +potential as universal pre-trained models that can be easily adapted to new +tasks with minimal fine-tuning and limited data. Furthermore, we assess the +expressivity of the learned representations, particularly, when used to augment +downstream GNNs. We demonstrate through extensive benchmarking and empirical +analysis that PSEs generally enhance downstream models. However, some datasets +may require specific PSE-augmentations to achieve optimal performance. +Nevertheless, our findings highlight their significant potential to become +integral components of future graph foundation models. We provide new insights +into the strengths and limitations of PSEs, contributing to the broader +discourse on foundation models in graph learning. + +
+
+ comment: Published at TMLR (https://openreview.net/forum?id=mSoDRZXsqj) +
+
+
+
+
+ + ♻ ☆ DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of + Daily Life ICLR 2025 + + +
+ As users increasingly seek guidance from LLMs for decision-making in daily +life, many of these decisions are not clear-cut and depend significantly on the +personal values and ethical standards of people. We present DailyDilemmas, a +dataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma +presents two possible actions, along with affected parties and relevant human +values for each action. Based on these dilemmas, we gather a repository of +human values covering diverse everyday topics, such as interpersonal +relationships, workplace, and environmental issues. With DailyDilemmas, we +evaluate LLMs on these dilemmas to determine what action they will choose and +the values represented by these action choices. Then, we analyze values through +the lens of five theoretical frameworks inspired by sociology, psychology, and +philosophy, including the World Values Survey, Moral Foundations Theory, +Maslow's Hierarchy of Needs, Aristotle's Virtues, and Plutchik's Wheel of +Emotions. For instance, we find LLMs are most aligned with self-expression over +survival in World Values Survey and care over loyalty in Moral Foundations +Theory. Interestingly, we find substantial preference differences in models for +some core values. For example, for truthfulness, Mixtral-8x7B neglects it by +9.7% while GPT-4-turbo selects it by 9.4%. We also study the recent guidance +released by OpenAI (ModelSpec), and Anthropic (Constitutional AI) to understand +how their designated principles reflect their models' actual value +prioritization when facing nuanced moral reasoning in daily-life settings. +Finally, we find that end users cannot effectively steer such prioritization +using system prompts. + +
+
+ comment: Accepted into ICLR 2025 (spotlight) +
+
+
+
+
+ + ♻ ☆ Test-Time Compute: from System-1 Thinking to System-2 Thinking + + +
+ The remarkable performance of the o1 model in complex reasoning demonstrates +that test-time compute scaling can further unlock the model's potential, +enabling powerful System-2 thinking. However, there is still a lack of +comprehensive surveys for test-time compute scaling. We trace the concept of +test-time compute back to System-1 models. In System-1 models, test-time +compute addresses distribution shifts and improves robustness and +generalization through parameter updating, input modification, representation +editing, and output calibration. In System-2 models, it enhances the model's +reasoning ability to solve complex problems through repeated sampling, +self-correction, and tree search. We organize this survey according to the +trend of System-1 to System-2 thinking, highlighting the key role of test-time +compute in the transition from System-1 models to weak System-2 models, and +then to strong System-2 models. We also point out a few possible future +directions. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ SheetAgent: Towards A Generalist Agent for Spreadsheet Reasoning and + Manipulation via Large Language Models WWW + + +
+ Spreadsheets are ubiquitous across the World Wide Web, playing a critical +role in enhancing work efficiency across various domains. Large language model +(LLM) has been recently attempted for automatic spreadsheet manipulation but +has not yet been investigated in complicated and realistic tasks where +reasoning challenges exist (e.g., long horizon manipulation with multi-step +reasoning and ambiguous requirements). To bridge the gap with the real-world +requirements, we introduce SheetRM, a benchmark featuring long-horizon and +multi-category tasks with reasoning-dependent manipulation caused by real-life +challenges. To mitigate the above challenges, we further propose SheetAgent, a +novel autonomous agent that utilizes the power of LLMs. SheetAgent consists of +three collaborative modules: Planner, Informer, and Retriever, achieving both +advanced reasoning and accurate manipulation over spreadsheets without human +interaction through iterative task reasoning and reflection. Extensive +experiments demonstrate that SheetAgent delivers 20--40\% pass rate +improvements on multiple benchmarks over baselines, achieving enhanced +precision in spreadsheet manipulation and demonstrating superior table +reasoning abilities. More details and visualizations are available at the +project website: https://sheetagent.github.io/. The datasets and source code +are available at https://anonymous.4open.science/r/SheetAgent. + +
+
+ comment: Accepted by International World Wide Web Conference (WWW) 2025 (oral) +
+
+
+
+
+ + ♻ ☆ SpikeLLM: Scaling up Spiking Neural Network to Large Language Models via + Saliency-based Spiking + + +
+ Recent advancements in large language models (LLMs) with billions of +parameters have improved performance in various applications, but their +inference processes demand significant energy and computational resources. In +contrast, the human brain, with approximately 86 billion neurons, is much more +energy-efficient than LLMs with similar parameters. Inspired by this, we +redesign 7$\sim$70 billion parameter LLMs using bio-plausible spiking +mechanisms, emulating the efficient behavior of the human brain. We propose the +first spiking large language model, SpikeLLM. Coupled with the proposed model, +two essential approaches are proposed to improve spike training efficiency: +Generalized Integrate-and-Fire (GIF) neurons to compress spike length from $T$ +to $\frac{T}{L} \log_2 L$ bits, and an Optimal Brain Spiking framework to +divide outlier channels and allocate different $T$ for GIF neurons, which +further compresses spike length to approximately $\log_2 T$ bits. The necessity of +spike-driven LLM is proved by comparison with quantized LLMs with similar +operations. In the OmniQuant pipeline, SpikeLLM reduces 11.01% WikiText2 +perplexity and improves 2.55% accuracy of common scene reasoning on a LLAMA-7B +W4A4 model. In the GPTQ pipeline, SpikeLLM achieves direct additive in linear +layers, significantly exceeding PB-LLMs. + +
+
+
+
+
+ + ♻ ☆ Efficient Learning Under Density Shift in Incremental Settings Using + Cramér-Rao-Based Regularization + + +
+ The continuous surge in data volume and velocity is often dealt with using +data orchestration and distributed processing approaches, abstracting away the +machine learning challenges that exist at the algorithmic level. With growing +interest in automating the learning loop, training with data that arrive in a +sequence rather than in the classical in-memory training data form will face a +machine learning challenge because of evolving feature distributions across +batches of training data biasing the cross-validation step +(\cite{sugiyama2012machine}). This work takes a distributed density estimation +angle to the problem where data are temporally distributed. It processes data +in batches and allows a neural network to treat a batch as training data. The +method accumulates knowledge about the data density via posterior probability +absorption using the Fisher Information Matrix, which contains information +about the local optimization gradients for the batch. This is then used as a +regularizer for the loss in the following batch, and therefore the density +estimate for the entire dataset constructively gets more robust to the non-iid +distribution shift. This needs the presence of a pair of batches in memory at a +time, so the space cost is not a function of the size of the complete, +distributed dataset. We proposed a novel regularization-based approach +Covariate Shift Correction $C^{2}A$ that leverages Fisher information and +Kullback-Leibler divergence to adapt to both natural and sequential covariate +shift caused by dataset fragmentation. $C^{2}A$ achieves $19\%$ accuracy at +maximum against state-of-the-art methods. + +
+
+ comment: This is an older version of our paper arXiv:2502.15756; it is a + duplicate that was uploaded by mistake. There are mistakes in the + method section of this version +
+
+
+
+
+ + ♻ ☆ Long-Term EEG Partitioning for Seizure Onset Detection AAAI 2025 + + +
+ Deep learning models have recently shown great success in classifying +epileptic patients using EEG recordings. Unfortunately, classification-based +methods lack a sound mechanism to detect the onset of seizure events. In this +work, we propose a two-stage framework, SODor, that explicitly models seizure +onset through a novel task formulation of subsequence clustering. Given an EEG +sequence, the framework first learns a set of second-level embeddings with +label supervision. It then employs model-based clustering to explicitly capture +long-term temporal dependencies in EEG sequences and identify meaningful +subsequences. Epochs within a subsequence share a common cluster assignment +(normal or seizure), with cluster or state transitions representing successful +onset detections. Extensive experiments on three datasets demonstrate that our +method can correct misclassifications, achieving 5\%-11\% classification +improvements over other baselines and accurately detecting seizure onsets. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Rethinking Channel Dimensions to Isolate Outliers for Low-bit Weight + Quantization of Large Language Models ICLR 2024 + + +
+ Large Language Models (LLMs) have recently demonstrated remarkable success +across various tasks. However, efficiently serving LLMs has been a challenge +due to the large memory bottleneck, specifically in small batch inference +settings (e.g. mobile devices). Weight-only quantization can be a promising +approach, but sub-4 bit quantization remains a challenge due to large-magnitude +activation outliers. To mitigate the undesirable outlier effect, we first +propose per-IC quantization, a simple yet effective method that creates +quantization groups within each input channel (IC) rather than the conventional +per-output-channel (per-OC). Our method is motivated by the observation that +activation outliers affect the input dimension of the weight matrix, so +similarly grouping the weights in the IC direction can isolate outliers within +a group. We also find that activation outliers do not dictate quantization +difficulty, and inherent weight sensitivities also exist. With per-IC +quantization as a new outlier-friendly scheme, we propose Adaptive Dimensions +(AdaDim), a versatile quantization framework that can adapt to various weight +sensitivity patterns. We demonstrate the effectiveness of AdaDim by augmenting +prior methods such as Round-To-Nearest and GPTQ, showing significant +improvements across various language modeling benchmarks for both base (up to ++4.7% on MMLU) and instruction-tuned (up to +10% on HumanEval) LLMs. Code is +available at https://github.com/johnheo/adadim-llm + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Cross-Spectral Vision Transformer for Biometric Authentication using + Forehead Subcutaneous Vein Pattern and Periocular Pattern + + +
+ Traditional biometric systems have encountered significant setbacks due to +various unavoidable factors, for example, face recognition-based biometrics +fails due to the wearing of face masks and fingerprints create hygiene +concerns. This paper proposes a novel lightweight cross-spectral vision +transformer (CS-ViT) for biometric authentication using forehead subcutaneous +vein patterns and periocular patterns, offering a promising alternative to +traditional methods, capable of performing well even with the face masks and +without any physical touch. The proposed framework comprises a cross-spectral +dual-channel architecture designed to handle two distinct biometric traits and +to capture inter-dependencies in terms of relative spectral patterns. Each +channel consists of a Phase-Only Correlation Cross-Spectral Attention (POC-CSA) +that captures their individual as well as correlated patterns. The computation +of cross-spectral attention using POC extracts the phase correlation in the +spatial features. Therefore, it is robust against the resolution/intensity +variations and illumination of the input images, assuming both biometric traits +are from the same person. The lightweight model is suitable for edge device +deployment. The performance of the proposed algorithm was rigorously evaluated +using the Forehead Subcutaneous Vein Pattern and Periocular Biometric Pattern +(FSVP-PBP) database. The results demonstrated the superiority of the algorithm +over state-of-the-art methods, achieving a remarkable classification accuracy +of 98.8% with the combined vein and periocular patterns. + +
+
+ comment: Submitted to IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ Leray-Schauder Mappings for Operator Learning + + +
+ We present an algorithm for learning operators between Banach spaces, based +on the use of Leray-Schauder mappings to learn a finite-dimensional +approximation of compact subspaces. We show that the resulting method is a +universal approximator of (possibly nonlinear) operators. We demonstrate the +efficiency of the approach on two benchmark datasets showing it achieves +results comparable to state of the art models. + +
+
+ comment: 13 pages, 2 figures, 1 table. Comments are welcome! v2: Theoretical + analysis expanded, several explanations regarding the experiments have been + added for improved clarity +
+
+
+
+
+ + ♻ ☆ Representation Engineering: A Top-Down Approach to AI Transparency + + +
+ In this paper, we identify and characterize the emerging area of +representation engineering (RepE), an approach to enhancing the transparency of +AI systems that draws on insights from cognitive neuroscience. RepE places +population-level representations, rather than neurons or circuits, at the +center of analysis, equipping us with novel methods for monitoring and +manipulating high-level cognitive phenomena in deep neural networks (DNNs). We +provide baselines and an initial analysis of RepE techniques, showing that they +offer simple yet effective solutions for improving our understanding and +control of large language models. We showcase how these methods can provide +traction on a wide range of safety-relevant problems, including honesty, +harmlessness, power-seeking, and more, demonstrating the promise of top-down +transparency research. We hope that this work catalyzes further exploration of +RepE and fosters advancements in the transparency and safety of AI systems. + +
+
+ comment: Code is available at + https://github.com/andyzoujm/representation-engineering +
+
+
+
+
+ + ♻ ☆ TokenSelect: Efficient Long-Context Inference and Length Extrapolation + for LLMs via Dynamic Token-Level KV Cache Selection + + +
+ The rapid advancement of Large Language Models (LLMs) has driven growing +demand for processing extended context sequences in contemporary applications. +However, this progress faces two major challenges: performance degradation due +to sequence lengths out-of-distribution, and excessively long inference times +caused by the quadratic computational complexity of attention. These issues +hinder the application of LLMs in long-context scenarios. In this paper, we +propose Dynamic Token-Level KV Cache Selection (TokenSelect), a training-free +method for efficient and accurate long-context inference. TokenSelect builds +upon the observation of non-contiguous attention sparsity, using Query-Key dot +products to measure per-head KV Cache criticality at token-level. By per-head +soft voting mechanism, TokenSelect selectively involves a few critical KV cache +tokens in attention calculation without sacrificing accuracy. To further +accelerate TokenSelect, we design the Selection Cache based on observations of +consecutive Query similarity and implemented efficient dot product kernel, +significantly reducing the overhead. A comprehensive evaluation of TokenSelect +demonstrates up to 23.84x speedup in attention computation and up to 2.28x +acceleration in end-to-end latency, while providing superior performance +compared to state-of-the-art long-context inference methods. + +
+
+
+
+
+ + ♻ ☆ Speed-accuracy relations for the diffusion models: Wisdom from + nonequilibrium thermodynamics and optimal transport + + +
+ We discuss a connection between a generative model, called the diffusion +model, and nonequilibrium thermodynamics for the Fokker-Planck equation, called +stochastic thermodynamics. Based on the techniques of stochastic +thermodynamics, we derive the speed-accuracy relations for the diffusion +models, which are inequalities that relate the accuracy of data generation to +the entropy production rate, which can be interpreted as the speed of the +diffusion dynamics in the absence of the non-conservative force. From a +stochastic thermodynamic perspective, our results provide a quantitative +insight into how best to generate data in diffusion models. The optimal +learning protocol is introduced by the geodesic of space of the 2-Wasserstein +distance in optimal transport theory. We numerically illustrate the validity of +the speed-accuracy relations for the diffusion models with different noise +schedules and the different data. We numerically discuss our results for the +optimal and suboptimal learning protocols. We also show the inaccurate data +generation due to the non-conservative force, and the applicability of our +results to data generation from the real-world image datasets. + +
+
+ comment: 36 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Structural-Entropy-Based Sample Selection for Efficient and Effective + Learning ICLR 2025 + + +
+ Sample selection improves the efficiency and effectiveness of machine +learning models by providing informative and representative samples. Typically, +samples can be modeled as a sample graph, where nodes are samples and edges +represent their similarities. Most existing methods are based on local +information, such as the training difficulty of samples, thereby overlooking +global information, such as connectivity patterns. This oversight can result in +suboptimal selection because global information is crucial for ensuring that +the selected samples well represent the structural properties of the graph. To +address this issue, we employ structural entropy to quantify global information +and losslessly decompose it from the whole graph to individual nodes using the +Shapley value. Based on the decomposition, we present +$\textbf{S}$tructural-$\textbf{E}$ntropy-based sample $\textbf{S}$election +($\textbf{SES}$), a method that integrates both global and local information to +select informative and representative samples. SES begins by constructing a +$k$NN-graph among samples based on their similarities. It then measures sample +importance by combining structural entropy (global metric) with training +difficulty (local metric). Finally, SES applies importance-biased blue noise +sampling to select a set of diverse and representative samples. Comprehensive +experiments on three learning scenarios -- supervised learning, active +learning, and continual learning -- clearly demonstrate the effectiveness of +our method. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ PATCH: a deep learning method to assess heterogeneity of artistic + practice in historical paintings + + +
+ The history of art has seen significant shifts in the manner in which +artworks are created, making understanding of creative processes a central +question in technical art history. In the Renaissance and Early Modern period, +paintings were largely produced by master painters directing workshops of +apprentices who often contributed to projects. The masters varied significantly +in artistic and managerial styles, meaning different combinations of artists +and implements might be seen both between masters and within workshops or even +individual canvases. Information on how different workshops were managed and +the processes by which artworks were created remains elusive. Machine learning +methods have potential to unearth new information about artists' creative +processes by extending the analysis of brushwork to a microscopic scale. +Analysis of workshop paintings, however, presents a challenge in that +documentation of the artists and materials involved is sparse, meaning external +examples are not available to train networks to recognize their contributions. +Here we present a novel machine learning approach we call pairwise assignment +training for classifying heterogeneity (PATCH) that is capable of identifying +individual artistic practice regimes with no external training data, or "ground +truth." The method achieves unsupervised results by supervised means, and +outperforms both simple statistical procedures and unsupervised machine +learning methods. We apply this method to two historical paintings by the +Spanish Renaissance master, El Greco: The Baptism of Christ and Christ on the +Cross with Landscape, and our findings regarding the former potentially +challenge previous work that has assigned the painting to workshop members. +Further, the results of our analyses create a measure of heterogeneity of +artistic practice that can be used to characterize artworks across time and +space. + +
+
+ comment: main text: 16 pages, 6 figures; SI: 7 pages, 3 figures; v2: minor + typo corrections, higher resolution figures +
+
+
+
+
+ + ♻ ☆ Node-Time Conditional Prompt Learning In Dynamic Graphs ICLR 2025 + + +
+ Dynamic graphs capture evolving interactions between entities, such as in +social networks, online learning platforms, and crowdsourcing projects. For +dynamic graph modeling, dynamic graph neural networks (DGNNs) have emerged as a +mainstream technique. However, they are generally pre-trained on the link +prediction task, leaving a significant gap from the objectives of downstream +tasks such as node classification. To bridge the gap, prompt-based learning has +gained traction on graphs, but most existing efforts focus on static graphs, +neglecting the evolution of dynamic graphs. In this paper, we propose +DYGPROMPT, a novel pre-training and prompt learning framework for dynamic graph +modeling. First, we design dual prompts to address the gap in both task +objectives and temporal variations across pre-training and downstream tasks. +Second, we recognize that node and time features mutually characterize each +other, and propose dual condition-nets to model the evolving node-time patterns +in downstream tasks. Finally, we thoroughly evaluate and analyze DYGPROMPT +through extensive experiments on four public datasets. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ AdvLogo: Adversarial Patch Attack against Object Detectors based on + Diffusion Models + + +
+ With the rapid development of deep learning, object detectors have +demonstrated impressive performance; however, vulnerabilities still exist in +certain scenarios. Current research exploring the vulnerabilities using +adversarial patches often struggles to balance the trade-off between attack +effectiveness and visual quality. To address this problem, we propose a novel +framework of patch attack from semantic perspective, which we refer to as +AdvLogo. Based on the hypothesis that every semantic space contains an +adversarial subspace where images can cause detectors to fail in recognizing +objects, we leverage the semantic understanding of the diffusion denoising +process and drive the process to adversarial subareas by perturbing the latent +and unconditional embeddings at the last timestep. To mitigate the distribution +shift that exposes a negative impact on image quality, we apply perturbation to +the latent in frequency domain with the Fourier Transform. Experimental results +demonstrate that AdvLogo achieves strong attack performance while maintaining +high visual quality. + +
+
+
+
+
+ + ♻ ☆ Generative Representational Instruction Tuning + + +
+ All text-based language problems can be reduced to either generation or +embedding. Current models only perform well at one or the other. We introduce +generative representational instruction tuning (GRIT) whereby a large language +model is trained to handle both generative and embedding tasks by +distinguishing between them through instructions. Compared to other open +models, our resulting GritLM 7B sets a new state of the art on the Massive Text +Embedding Benchmark (MTEB) and outperforms all models up to its size on a range +of generative tasks. By scaling up further, GritLM 8x7B outperforms all open +generative language models that we tried while still being among the best +embedding models. Notably, we find that GRIT matches training on only +generative or embedding data, thus we can unify both at no performance loss. +Among other benefits, the unification via GRIT speeds up Retrieval-Augmented +Generation (RAG) by > 60% for long documents, by no longer requiring separate +retrieval and generation models. Models, code, etc. are freely available at +https://github.com/ContextualAI/gritlm. + +
+
+ comment: 67 pages (16 main), 25 figures, 34 tables +
+
+
+
+
+ + ♻ ☆ Calib3D: Calibrating Model Preferences for Reliable 3D Scene + Understanding WACV 2025 + + +
+ Safety-critical 3D scene understanding tasks necessitate not only accurate +but also confident predictions from 3D perception models. This study introduces +Calib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D +scene understanding models from an uncertainty estimation viewpoint. We +comprehensively evaluate 28 state-of-the-art models across 10 diverse 3D +datasets, uncovering insightful phenomena that cope with both the aleatoric and +epistemic uncertainties in 3D scene understanding. We discover that despite +achieving impressive levels of accuracy, existing models frequently fail to +provide reliable uncertainty estimates -- a pitfall that critically undermines +their applicability in safety-sensitive contexts. Through extensive analysis of +key factors such as network capacity, LiDAR representations, rasterization +resolutions, and 3D data augmentation techniques, we correlate these aspects +directly with the model calibration efficacy. Furthermore, we introduce DeptS, +a novel depth-aware scaling approach aimed at enhancing 3D model calibration. +Extensive experiments across a wide range of configurations validate the +superiority of our method. We hope this work could serve as a cornerstone for +fostering reliable 3D scene understanding. Code and benchmark toolkit are +publicly available. + +
+
+ comment: WACV 2025 Oral; 26 pages, 8 figures, 12 tables; Code at + https://github.com/ldkong1205/Calib3D +
+
+
+
+
+ + ♻ ☆ Federated Learning in Practice: Reflections and Projections + + +
+ Federated Learning (FL) is a machine learning technique that enables multiple +entities to collaboratively learn a shared model without exchanging their local +data. Over the past decade, FL systems have achieved substantial progress, +scaling to millions of devices across various learning domains while offering +meaningful differential privacy (DP) guarantees. Production systems from +organizations like Google, Apple, and Meta demonstrate the real-world +applicability of FL. However, key challenges remain, including verifying +server-side DP guarantees and coordinating training across heterogeneous +devices, limiting broader adoption. Additionally, emerging trends such as large +(multi-modal) models and blurred lines between training, inference, and +personalization challenge traditional FL frameworks. In response, we propose a +redefined FL framework that prioritizes privacy principles rather than rigid +definitions. We also chart a path forward by leveraging trusted execution +environments and open-source ecosystems to address these challenges and +facilitate future advancements in FL. + +
+
+ comment: Published at 2024 IEEE 6th International Conference on Trust, Privacy + and Security in Intelligent Systems, and Applications (TPS-ISA) +
+
+
+
+
+ + ♻ ☆ Compositional simulation-based inference for time series ICLR 2025 + + +
+ Amortized simulation-based inference (SBI) methods train neural networks on +simulated data to perform Bayesian inference. While this strategy avoids the +need for tractable likelihoods, it often requires a large number of simulations +and has been challenging to scale to time series data. Scientific simulators +frequently emulate real-world dynamics through thousands of single-state +transitions over time. We propose an SBI approach that can exploit such +Markovian simulators by locally identifying parameters consistent with +individual state transitions. We then compose these local results to obtain a +posterior over parameters that align with the entire time series observation. +We focus on applying this approach to neural posterior score estimation but +also show how it can be applied, e.g., to neural likelihood (ratio) estimation. +We demonstrate that our approach is more simulation-efficient than directly +estimating the global posterior on several synthetic benchmark tasks and +simulators used in ecology and epidemiology. Finally, we validate scalability +and simulation efficiency of our approach by applying it to a high-dimensional +Kolmogorov flow simulator with around one million data dimensions. + +
+
+ comment: To be published in the proceedings of the Thirteenth International + Conference on Learning Representations (ICLR 2025), Singapore, 2025 +
+
+
+
+
+ + ♻ ☆ Direct Distributional Optimization for Provable Alignment of Diffusion + Models + + +
+ We introduce a novel alignment method for diffusion models from distribution +optimization perspectives while providing rigorous convergence guarantees. We +first formulate the problem as a generic regularized loss minimization over +probability distributions and directly optimize the distribution using the Dual +Averaging method. Next, we enable sampling from the learned distribution by +approximating its score function via Doob's $h$-transform technique. The +proposed framework is supported by rigorous convergence guarantees and an +end-to-end bound on the sampling error, which imply that when the original +distribution's score is known accurately, the complexity of sampling from +shifted distributions is independent of isoperimetric conditions. This +framework is broadly applicable to general distribution optimization problems, +including alignment tasks in Reinforcement Learning with Human Feedback (RLHF), +Direct Preference Optimization (DPO), and Kahneman-Tversky Optimization (KTO). +We empirically validate its performance on synthetic and image datasets using +the DPO objective. + +
+
+
+
+
+ + ♻ ☆ Iterative Nash Policy Optimization: Aligning LLMs with General + Preferences via No-Regret Learning + + +
+ Reinforcement Learning with Human Feedback (RLHF) has achieved great success +in aligning large language models (LLMs) with human preferences. Prevalent RLHF +approaches are reward-based, following the Bradley-Terry (BT) model assumption, +which may not fully capture the complexity of human preferences. In this paper, +we explore RLHF under a general preference framework and approach it from a +game-theoretic perspective. Specifically, we formulate the problem as a +two-player game and propose a novel online algorithm, iterative Nash policy +optimization (INPO). The key idea is to let the policy play against itself via +no-regret learning, thereby approximating the Nash policy. Unlike previous +methods, INPO bypasses the need for estimating the expected win rate for +individual responses, which typically incurs high computational or annotation +costs. Instead, we introduce a new loss objective that is directly minimized +over a preference dataset. We provide theoretical analysis for our approach and +demonstrate its effectiveness through experiments on various representative +benchmarks. With an LLaMA-3-8B-based SFT model, INPO achieves a 42.6% +length-controlled win rate on AlpacaEval 2.0 and a 37.8% win rate on +Arena-Hard, showing substantial improvement over the state-of-the-art online +RLHF algorithms. + +
+
+
+
+
+ + ♻ ☆ Discovering physical laws with parallel combinatorial tree search + + +
+ Symbolic regression plays a crucial role in modern scientific research thanks +to its capability of discovering concise and interpretable mathematical +expressions from data. A grand challenge lies in the arduous search for +parsimonious and generalizable mathematical formulas, in an infinite search +space, while intending to fit the training data. Existing algorithms have faced +a critical bottleneck of accuracy and efficiency over a decade when handling +problems of complexity, which essentially hinders the pace of applying symbolic +regression for scientific exploration across interdisciplinary domains. To this +end, we introduce a parallel combinatorial tree search (PCTS) model to +efficiently distill generic mathematical expressions from limited data. Through +a series of extensive experiments, we demonstrate the superior accuracy and +efficiency of PCTS for equation discovery, which greatly outperforms the +state-of-the-art baseline models on over 200 synthetic and experimental +datasets (e.g., lifting its performance by up to 99% accuracy improvement and +one-order of magnitude speed up). PCTS represents a key advance in accurate and +efficient data-driven discovery of symbolic, interpretable models (e.g., +underlying physical laws) and marks a pivotal transition towards scalable +symbolic learning. + +
+
+
+
+
+ + ♻ ☆ Learning to Learn Weight Generation via Trajectory Diffusion + + +
+ Diffusion-based algorithms have emerged as promising techniques for weight +generation, particularly in scenarios like multi-task learning that require +frequent weight updates. However, existing solutions suffer from limited +cross-task transferability. In addition, they only utilize optimal weights as +training samples, ignoring the value of other weights in the optimization +process. To address these issues, we propose Lt-Di, which integrates the +diffusion algorithm with meta-learning to generate weights for unseen tasks. +Furthermore, we extend the vanilla diffusion algorithm into a trajectory +diffusion algorithm to utilize other weights along the optimization trajectory. +Trajectory diffusion decomposes the entire diffusion chain into multiple +shorter ones, improving training and inference efficiency. We analyze the +convergence properties of the weight generation paradigm and improve +convergence efficiency without additional time overhead. Our experiments +demonstrate Lt-Di's higher accuracy while reducing computational overhead +across various tasks, including zero-shot and few-shot learning, multi-domain +generalization, and large-scale language model fine-tuning. Our code is released +at https://anonymous.4open.science/r/Lt-Di-0E51. + +
+
+
+
+
+ + ♻ ☆ Struc2mapGAN: improving synthetic cryo-EM density maps with generative + adversarial networks + + +
+ Generating synthetic cryogenic electron microscopy 3D density maps from +molecular structures has potential important applications in structural +biology. Yet existing simulation-based methods cannot mimic all the complex +features present in experimental maps, such as secondary structure elements. As +an alternative, we propose struc2mapGAN, a novel data-driven method that +employs a generative adversarial network to produce improved experimental-like +density maps from molecular structures. More specifically, struc2mapGAN uses a +nested U-Net architecture as the generator, with an additional L1 loss term and +further processing of raw training experimental maps to enhance learning +efficiency. While struc2mapGAN can promptly generate maps after training, we +demonstrate that it outperforms existing simulation-based methods for a wide +array of tested maps and across various evaluation metrics. + +
+
+
+
+
+ + ♻ ☆ LLMOPT: Learning to Define and Solve General Optimization Problems from + Scratch + + +
+ Optimization problems are prevalent across various scenarios. Formulating and +then solving optimization problems described by natural language often requires +highly specialized human expertise, which could block the widespread +application of optimization-based decision making. To automate problem +formulation and solving, leveraging large language models (LLMs) has emerged as +a potential way. However, this kind of approach suffers from the issue of +optimization generalization. Namely, the accuracy of most current LLM-based +methods and the generality of optimization problem types that they can model +are still limited. In this paper, we propose a unified learning-based framework +called LLMOPT to boost optimization generalization. Starting from the natural +language descriptions of optimization problems and a pre-trained LLM, LLMOPT +constructs the introduced five-element formulation as a universal model for +learning to define diverse optimization problem types. Then, LLMOPT employs the +multi-instruction tuning to enhance both problem formalization and solver code +generation accuracy and generality. After that, to prevent hallucinations in +LLMs, such as sacrificing solving accuracy to avoid execution errors, the model +alignment and self-correction mechanism are adopted in LLMOPT. We evaluate the +optimization generalization ability of LLMOPT and compared methods across six +real-world datasets covering roughly 20 fields such as health, environment, +energy and manufacturing, etc. Extensive experiment results show that LLMOPT is +able to model various optimization problem types such as linear/nonlinear +programming, mixed integer programming, and combinatorial optimization, and +achieves a notable 11.08% average solving accuracy improvement compared with +the state-of-the-art methods. The code is available at +https://github.com/caigaojiang/LLMOPT. + +
+
+
+
+
+ + ♻ ☆ TSVD: Bridging Theory and Practice in Continual Learning with + Pre-trained Models ICLR 2025 + + +
+ The goal of continual learning (CL) is to train a model that can solve +multiple tasks presented sequentially. Recent CL approaches have achieved +strong performance by leveraging large pre-trained models that generalize well +to downstream tasks. However, such methods lack theoretical guarantees, making +them prone to unexpected failures. Conversely, principled CL approaches often +fail to achieve competitive performance. In this work, we aim to bridge this +gap between theory and practice by designing a simple CL method that is +theoretically sound and highly performant. Specifically, we lift pre-trained +features into a higher dimensional space and formulate an over-parametrized +minimum-norm least-squares problem. We find that the lifted features are highly +ill-conditioned, potentially leading to large training errors (numerical +instability) and increased generalization errors. We address these challenges +by continually truncating the singular value decomposition (SVD) of the lifted +features. Our approach, termed TSVD, is stable with respect to the choice of +hyperparameters, can handle hundreds of tasks, and outperforms state-of-the-art +CL methods on multiple datasets. Importantly, our method satisfies a recurrence +relation throughout its continual learning process, which allows us to prove it +maintains small training and generalization errors by appropriately truncating +a fraction of SVD factors. This results in a stable continual learning method +with strong empirical performance and theoretical guarantees. Code available: +https://github.com/liangzu/tsvd. + +
+
+ comment: 47 pages, 18 figures, 16 tables (v2, accepted to ICLR 2025) +
+
+
+
+
+ + ♻ ☆ MMed-RAG: Versatile Multimodal RAG System for Medical Vision Language + Models ICLR 2025 + + +
+ Artificial Intelligence (AI) has demonstrated significant potential in +healthcare, particularly in disease diagnosis and treatment planning. Recent +progress in Medical Large Vision-Language Models (Med-LVLMs) has opened up new +possibilities for interactive diagnostic tools. However, these models often +suffer from factual hallucination, which can lead to incorrect diagnoses. +Fine-tuning and retrieval-augmented generation (RAG) have emerged as methods to +address these issues. However, the amount of high-quality data and distribution +shifts between training data and deployment data limit the application of +fine-tuning methods. Although RAG is lightweight and effective, existing +RAG-based approaches are not sufficiently general to different medical domains +and can potentially cause misalignment issues, both between modalities and +between the model and the ground truth. In this paper, we propose a versatile +multimodal RAG system, MMed-RAG, designed to enhance the factuality of +Med-LVLMs. Our approach introduces a domain-aware retrieval mechanism, an +adaptive retrieved contexts selection method, and a provable RAG-based +preference fine-tuning strategy. These innovations make the RAG process +sufficiently general and reliable, significantly improving alignment when +introducing retrieved contexts. Experimental results across five medical +datasets (involving radiology, ophthalmology, pathology) on medical VQA and +report generation demonstrate that MMed-RAG can achieve an average improvement +of 43.8% in the factual accuracy of Med-LVLMs. Our data and code are available +in https://github.com/richard-peng-xia/MMed-RAG. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ HORAE: A Domain-Agnostic Modeling Language for Automating Multimodal + Service Regulation + + +
+ Artificial intelligence is rapidly encroaching on the field of service +regulation. This work-in-progress article presents the design principles behind +HORAE, a unified specification language to model multimodal regulation rules +across a diverse set of domains. We show how HORAE facilitates an intelligent +service regulation pipeline by further exploiting a fine-tuned large language +model named HORAE that automates the HORAE modeling process, thereby yielding +an end-to-end framework for fully automated intelligent service regulation. + +
+
+
+
+
+ + ♻ ☆ Scaling Offline Model-Based RL via Jointly-Optimized World-Action Model + Pretraining ICLR 2025 + + +
+ A significant aspiration of offline reinforcement learning (RL) is to develop +a generalist agent with high capabilities from large and heterogeneous +datasets. However, prior approaches that scale offline RL either rely heavily +on expert trajectories or struggle to generalize to diverse unseen tasks. +Inspired by the excellent generalization of world model in conditional video +generation, we explore the potential of image observation-based world model for +scaling offline RL and enhancing generalization on novel tasks. In this paper, +we introduce JOWA: Jointly-Optimized World-Action model, an offline model-based +RL agent pretrained on multiple Atari games with 6 billion tokens data to learn +general-purpose representation and decision-making ability. Our method jointly +optimizes a world-action model through a shared transformer backbone, which +stabilizes temporal difference learning with large models during pretraining. +Moreover, we propose a provably efficient and parallelizable planning algorithm +to compensate for the Q-value estimation error and thus search out better +policies. Experimental results indicate that our largest agent, with 150 +million parameters, achieves 78.9% human-level performance on pretrained games +using only 10% subsampled offline data, outperforming existing state-of-the-art +large-scale offline RL baselines by 31.6% on average. Furthermore, JOWA scales +favorably with model capacity and can sample-efficiently transfer to novel +games using only 5k offline fine-tuning data (approximately 4 trajectories) per +game, demonstrating superior generalization. We will release codes and model +weights at https://github.com/CJReinforce/JOWA + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ PhyMPGN: Physics-encoded Message Passing Graph Network for + spatiotemporal PDE systems + + +
+ Solving partial differential equations (PDEs) serves as a cornerstone for +modeling complex dynamical systems. Recent progress has demonstrated great +benefits of data-driven neural-based models for predicting spatiotemporal +dynamics (e.g., tremendous speedup gain compared with classical numerical +methods). However, most existing neural models rely on rich training data, have +limited extrapolation and generalization abilities, and struggle to produce +precise or reliable physical prediction under intricate conditions (e.g., +irregular mesh or geometry, complex boundary conditions, diverse PDE +parameters, etc.). To this end, we propose a new graph learning approach, +namely, Physics-encoded Message Passing Graph Network (PhyMPGN), to model +spatiotemporal PDE systems on irregular meshes given small training datasets. +Specifically, we incorporate a GNN into a numerical integrator to approximate +the temporal marching of spatiotemporal dynamics for a given PDE system. +Considering that many physical phenomena are governed by diffusion processes, +we further design a learnable Laplace block, which encodes the discrete +Laplace-Beltrami operator, to aid and guide the GNN learning in a physically +feasible solution space. A boundary condition padding strategy is also designed +to improve the model convergence and accuracy. Extensive experiments +demonstrate that PhyMPGN is capable of accurately predicting various types of +spatiotemporal dynamics on coarse unstructured meshes, consistently achieves +the state-of-the-art results, and outperforms other baselines with considerable +gains. + +
+
+
+
+
+ + ♻ ☆ A Closer Look at Machine Unlearning for Large Language Models ICLR 2025 + + +
+ Large language models (LLMs) may memorize sensitive or copyrighted content, +raising privacy and legal concerns. Due to the high cost of retraining from +scratch, researchers attempt to employ machine unlearning to remove specific +content from LLMs while preserving the overall performance. In this paper, we +discuss several issues in machine unlearning for LLMs and provide our insights +on possible approaches. To address the issue of inadequate evaluation of model +outputs after unlearning, we introduce three additional metrics to evaluate +token diversity, sentence semantics, and factual correctness. We then +categorize unlearning methods into untargeted and targeted, and discuss their +issues respectively. Specifically, the behavior that untargeted unlearning +attempts to approximate is unpredictable and may involve hallucinations, and +existing regularization is insufficient for targeted unlearning. To alleviate +these issues, we propose using the objective of maximizing entropy (ME) for +untargeted unlearning and incorporate answer preservation (AP) loss as +regularization for targeted unlearning. Experimental results across three +scenarios, i.e., fictitious unlearning, continual unlearning, and real-world +unlearning, demonstrate the effectiveness of our approaches. The code is +available at https://github.com/sail-sg/closer-look-LLM-unlearning. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ A Lean Dataset for International Math Olympiad: Small Steps towards + Writing Math Proofs for Hard Problems + + +
+ Using AI to write formal proofs for mathematical problems is a challenging +task that has seen some advancements in recent years. Automated systems such as +Lean can verify the correctness of proofs written in formal language, yet +writing the proofs in formal language can be challenging for humans and +machines. The miniF2F benchmark has 20 IMO problems in its test set, yet formal +proofs are available only for 6 of these problems (3 of which are only written +by mathematicians). The model with best accuracy can only prove 2 of these 20 +IMO problems, from the 1950s and 60s, while its training set is a secret. In this +work, we write complete, original formal proofs for the remaining IMO problems +in Lean along with 3 extra problems from IMO 2022 and 2023. This effort expands +the availability of proof currently in the public domain by creating 5,880 +lines of Lean proof. The goal of the paper is to pave the way for developing AI +models that can automatically write the formal proofs for all the IMO problems +in miniF2F and beyond by providing an evaluation benchmark. In this pursuit, we +devise a method to decompose the proofs of these problems into their building +blocks, constructing a dataset of 1,329 lemmas with more than 40k lines of Lean +code. These lemmas are not trivial, yet they are approachable, providing the +opportunity to evaluate and diagnose the failures and successes of AI models. +We evaluate the ability of the SOTA LLMs on our dataset and analyze their +success and failure modes from different perspectives. Our dataset and code are +available at: https://github.com/roozbeh-yz/IMO-Steps. + +
+
+
+
+
+ + ♻ ☆ Scalable Decision-Making in Stochastic Environments through Learned + Temporal Abstraction ICLR2025 + + +
+ Sequential decision-making in high-dimensional continuous action spaces, +particularly in stochastic environments, faces significant computational +challenges. We explore this challenge in the traditional offline RL setting, +where an agent must learn how to make decisions based on data collected through +a stochastic behavior policy. We present Latent Macro Action Planner (L-MAP), +which addresses this challenge by learning a set of temporally extended +macro-actions through a state-conditional Vector Quantized Variational +Autoencoder (VQ-VAE), effectively reducing action dimensionality. L-MAP employs +a (separate) learned prior model that acts as a latent transition model and +allows efficient sampling of plausible actions. During planning, our approach +accounts for stochasticity in both the environment and the behavior policy by +using Monte Carlo tree search (MCTS). In offline RL settings, including +stochastic continuous control tasks, L-MAP efficiently searches over discrete +latent actions to yield high expected returns. Empirical results demonstrate +that L-MAP maintains low decision latency despite increased action +dimensionality. Notably, across tasks ranging from continuous control with +inherently stochastic dynamics to high-dimensional robotic hand manipulation, +L-MAP significantly outperforms existing model-based methods and performs +on-par with strong model-free actor-critic baselines, highlighting the +effectiveness of the proposed approach in planning in complex and stochastic +environments with high-dimensional action spaces. + +
+
+ comment: Accepted by ICLR2025. Code would be available at + https://github.com/BaitingLuo/L-MAP.git +
+
+
+
+
+ + ♻ ☆ On the Feature Learning in Diffusion Models + + +
+ The predominant success of diffusion models in generative modeling has +spurred significant interest in understanding their theoretical foundations. In +this work, we propose a feature learning framework aimed at analyzing and +comparing the training dynamics of diffusion models with those of traditional +classification models. Our theoretical analysis demonstrates that diffusion +models, due to the denoising objective, are encouraged to learn more balanced +and comprehensive representations of the data. In contrast, neural networks +with a similar architecture trained for classification tend to prioritize +learning specific patterns in the data, often focusing on easy-to-learn +components. To support these theoretical insights, we conduct several +experiments on both synthetic and real-world datasets, which empirically +validate our findings and highlight the distinct feature learning dynamics in +diffusion models compared to classification. + +
+
+
+
+
+ + ♻ ☆ Weighted Point Set Embedding for Multimodal Contrastive Learning Toward + Optimal Similarity Metric ICLR 2025 + + +
+ In typical multimodal contrastive learning, such as CLIP, encoders produce +one point in the latent representation space for each input. However, one-point +representation has difficulty in capturing the relationship and the similarity +structure of a huge amount of instances in the real world. For richer classes +of the similarity, we propose the use of weighted point sets, namely, sets of +pairs of weight and vector, as representations of instances. In this work, we +theoretically show the benefit of our proposed method through a new +understanding of the contrastive loss of CLIP, which we call symmetric InfoNCE. +We clarify that the optimal similarity that minimizes symmetric InfoNCE is the +pointwise mutual information, and show an upper bound of excess risk on +downstream classification tasks of representations that achieve the optimal +similarity. In addition, we show that our proposed similarity based on weighted +point sets consistently achieves the optimal similarity. To verify the +effectiveness of our proposed method, we demonstrate pretraining of text-image +representation models and classification tasks on common benchmarks. + +
+
+ comment: ICLR 2025 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Penalized Principal Component Analysis Using Smoothing + + +
+ Principal components computed via PCA (principal component analysis) are +traditionally used to reduce dimensionality in genomic data or to correct for +population stratification. In this paper, we explore the penalized eigenvalue +problem (PEP) which reformulates the computation of the first eigenvector as an +optimization problem and adds an $L_1$ penalty constraint to enforce sparseness +of the solution. The contribution of our article is threefold. First, we extend +PEP by applying smoothing to the original LASSO-type $L_1$ penalty. This allows +one to compute analytical gradients which enable faster and more efficient +minimization of the objective function associated with the optimization +problem. Second, we demonstrate how higher order eigenvectors can be calculated +with PEP using established results from singular value decomposition (SVD). +Third, we present four experimental studies to demonstrate the usefulness of +the smoothed penalized eigenvectors. Using data from the 1000 Genomes Project +dataset, we empirically demonstrate that our proposed smoothed PEP allows one +to increase numerical stability and obtain meaningful eigenvectors. We also +employ the penalized eigenvector approach in two additional real data +applications (computation of a polygenic risk score and clustering), +demonstrating that exchanging the penalized eigenvectors for their smoothed +counterparts can increase prediction accuracy in polygenic risk scores and +enhance discernibility of clusterings. Moreover, we compare our proposed +smoothed PEP to seven state-of-the-art algorithms for sparse PCA and evaluate +the accuracy of the obtained eigenvectors, their support recovery, and their +runtime. + +
+
+
+
+
+ + ♻ ☆ OLMoE: Open Mixture-of-Experts Language Models + + +
+ We introduce OLMoE, a fully open, state-of-the-art language model leveraging +sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but +uses only 1B per input token. We pretrain it on 5 trillion tokens and further +adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available +models with similar active parameters, even surpassing larger ones like +Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE +training, analyze routing in our model showing high specialization, and +open-source all aspects of our work: model weights, training data, code, and +logs. + +
+
+ comment: 63 pages (24 main), 36 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ On Large Language Model Continual Unlearning ICLR 2025 + + +
+ While large language models have demonstrated impressive performance across +various domains and tasks, their security issues have become increasingly +severe. Machine unlearning has emerged as a representative approach for model +safety and security by removing the influence of undesired data on the target +model. However, these methods do not sufficiently consider that unlearning +requests in real-world scenarios are continuously emerging, especially in the +context of LLMs, which may lead to accumulated model utility loss that +eventually becomes unacceptable. Moreover, existing LLM unlearning methods +often ignore previous data access limitations due to privacy concerns and +copyright protection. Without previous data, the utility preservation during +unlearning is much harder. To overcome these challenges, we propose the OOO +framework that includes an Orthogonal low-rank adapter (LoRA) for continually +unlearning requested data and an Out-Of-Distribution (OOD) detector to measure +the similarity between input and unlearning data. The orthogonal LoRA achieves +parameter disentanglement among continual unlearning requests. The OOD detector +is trained with a novel contrastive entropy loss and utilizes a glocal-aware +scoring mechanism. During inference, our OOO framework can decide whether and +to what extent to load the unlearning LoRA based on the OOD detector's +predicted similarity between the input and the unlearned knowledge. Notably, +OOO's effectiveness does not rely on any retained data. We conducted extensive +experiments on OOO and state-of-the-art LLM unlearning methods across three +tasks and seven datasets. The results indicate that OOO consistently achieves +the best unlearning effectiveness and utility preservation, especially when +facing continuous unlearning requests. The source codes can be found at +https://github.com/GCYZSL/O3-LLM-UNLEARNING. + +
+
+ comment: This paper has been accepted by ICLR 2025. The first two authors + contribute equally and they are ordered alphabetically +
+
+
+
+
+ + ♻ ☆ BECAUSE: Bilinear Causal Representation for Generalizable Offline + Model-based Reinforcement Learning + + +
+ Offline model-based reinforcement learning (MBRL) enhances data efficiency by +utilizing pre-collected datasets to learn models and policies, especially in +scenarios where exploration is costly or infeasible. Nevertheless, its +performance often suffers from the objective mismatch between model and policy +learning, resulting in inferior performance despite accurate model predictions. +This paper first identifies that the primary source of this mismatch is the +underlying confounders present in offline data for MBRL. Subsequently, we +introduce BilinEar CAUSal +rEpresentation (BECAUSE), an algorithm to capture causal +representation for both states and actions to reduce the influence of the +distribution shift, thus mitigating the objective mismatch problem. +Comprehensive evaluations on 18 tasks that vary in data quality and environment +context demonstrate the superior performance of BECAUSE over existing offline +RL algorithms. We show the generalizability and robustness of BECAUSE under +fewer samples or larger numbers of confounders. Additionally, we offer +theoretical analysis of BECAUSE to prove its error bound and sample efficiency +when integrating causal representation into offline MBRL. + +
+
+
+
+
+ + ♻ ☆ RobotFingerPrint: Unified Gripper Coordinate Space for Multi-Gripper + Grasp Synthesis and Transfer + + +
+ We introduce a novel grasp representation named the Unified Gripper +Coordinate Space (UGCS) for grasp synthesis and grasp transfer. Our +representation leverages spherical coordinates to create a shared coordinate +space across different robot grippers, enabling it to synthesize and transfer +grasps for both novel objects and previously unseen grippers. The strength of +this representation lies in the ability to map palm and fingers of a gripper +and the unified coordinate space. Grasp synthesis is formulated as predicting +the unified spherical coordinates on object surface points via a conditional +variational autoencoder. The predicted unified gripper coordinates establish +exact correspondences between the gripper and object points, which is used to +optimize grasp pose and joint values. Grasp transfer is facilitated through the +point-to-point correspondence between any two (potentially unseen) grippers and +solved via a similar optimization. Extensive simulation and real-world +experiments showcase the efficacy of the unified grasp representation for grasp +synthesis in generating stable and diverse grasps. Similarly, we showcase +real-world grasp transfer from human demonstrations across different objects. + +
+
+ comment: 8 pages, 11 figures, 3 tables. Project page available at + https://irvlutd.github.io/RobotFingerPrint +
+
+
+
+
+ + ♻ ☆ The Labyrinth of Links: Navigating the Associative Maze of Multi-modal + LLMs ICLR 2025 + + +
+ Multi-modal Large Language Models (MLLMs) have exhibited impressive +capability. However, recently many deficiencies of MLLMs have been found +compared to human intelligence, $\textit{e.g.}$, hallucination. To drive the +MLLMs study, the community dedicated efforts to building larger benchmarks with +complex tasks. In this paper, we propose benchmarking an essential but usually +overlooked intelligence: $\textbf{association}$, a human's basic capability to +link observation and prior practice memory. To comprehensively investigate +MLLM's performance on the association, we formulate the association task and +devise a standard benchmark based on adjective and verb semantic concepts. +Instead of costly data annotation and curation, we propose a convenient +$\textbf{annotation-free}$ construction method transforming the general dataset +for our association tasks. Simultaneously, we devise a rigorous data refinement +process to eliminate confusion in the raw dataset. Building on this database, +we establish three levels of association tasks: single-step, synchronous, and +asynchronous associations. Moreover, we conduct a comprehensive investigation +into the MLLMs' zero-shot association capabilities, addressing multiple +dimensions, including three distinct memory strategies, both open-source and +closed-source MLLMs, cutting-edge Mixture-of-Experts (MoE) models, and the +involvement of human experts. Our systematic investigation shows that current +open-source MLLMs consistently exhibit poor capability in our association +tasks, even the currently state-of-the-art GPT-4V(vision) also has a +significant gap compared to humans. We believe our benchmark would pave the way +for future MLLM studies. $\textit{Our data and code are available at:}$ +https://mvig-rhos.com/llm_inception. + +
+
+ comment: Accepted by ICLR 2025. Project page: + https://mvig-rhos.com/llm_inception +
+
+
+
+
+ + ♻ ☆ NL2FOL: Translating Natural Language to First-Order Logic for Logical + Fallacy Detection + + +
+ Translating natural language into formal language such as First-Order Logic +(FOL) is a foundational challenge in NLP with wide-ranging applications in +automated reasoning, misinformation tracking, and knowledge validation. In this +paper, we introduce Natural Language to First-Order Logic (NL2FOL), a framework +to autoformalize natural language to FOL step by step using Large Language +Models (LLMs). Our approach addresses key challenges in this translation +process, including the integration of implicit background knowledge. By +leveraging structured representations generated by NL2FOL, we use +Satisfiability Modulo Theory (SMT) solvers to reason about the logical validity +of natural language statements. We present logical fallacy detection as a case +study to evaluate the efficacy of NL2FOL. Being neurosymbolic, our approach +also provides interpretable insights into the reasoning process and +demonstrates robustness without requiring model fine-tuning or labeled training +data. Our framework achieves strong performance on multiple datasets. On the +LOGIC dataset, NL2FOL achieves an F1-score of 78%, while generalizing +effectively to the LOGICCLIMATE dataset with an F1-score of 80%. + +
+
+
+
+
+ + ♻ ☆ Identifying Drift, Diffusion, and Causal Structure from Temporal + Snapshots + + +
+ Stochastic differential equations (SDEs) are a fundamental tool for modelling +dynamic processes, including gene regulatory networks (GRNs), contaminant +transport, financial markets, and image generation. However, learning the +underlying SDE from data is a challenging task, especially if individual +trajectories are not observable. Motivated by burgeoning research in +single-cell datasets, we present the first comprehensive approach for jointly +identifying the drift and diffusion of an SDE from its temporal marginals. +Assuming linear drift and additive diffusion, we prove that these parameters +are identifiable from marginals if and only if the initial distribution lacks +any generalized rotational symmetries. We further prove that the causal graph +of any SDE with additive diffusion can be recovered from the SDE parameters. To +complement this theory, we adapt entropy-regularized optimal transport to +handle anisotropic diffusion, and introduce APPEX (Alternating Projection +Parameter Estimation from $X_0$), an iterative algorithm designed to estimate +the drift, diffusion, and causal graph of an additive noise SDE, solely from +temporal marginals. We show that APPEX iteratively decreases Kullback-Leibler +divergence to the true solution, and demonstrate its effectiveness on simulated +data from linear additive noise SDEs. + +
+
+
+
+
+ + ♻ ☆ Does SGD really happen in tiny subspaces? ICLR 2025 + + +
+ Understanding the training dynamics of deep neural networks is challenging +due to their high-dimensional nature and intricate loss landscapes. Recent +studies have revealed that, along the training trajectory, the gradient +approximately aligns with a low-rank top eigenspace of the training loss +Hessian, referred to as the dominant subspace. Given this alignment, this paper +explores whether neural networks can be trained within the dominant subspace, +which, if feasible, could lead to more efficient training methods. Our primary +observation is that when the SGD update is projected onto the dominant +subspace, the training loss does not decrease further. This suggests that the +observed alignment between the gradient and the dominant subspace is spurious. +Surprisingly, projecting out the dominant subspace proves to be just as +effective as the original update, despite removing the majority of the original +update component. We observe similar behavior across practical setups, +including the large learning rate regime (also known as Edge of Stability), +Sharpness-Aware Minimization, momentum, and adaptive optimizers. We discuss the +main causes and implications of this spurious alignment, shedding light on the +dynamics of neural network training. + +
+
+ comment: Published at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Asymptotic Behavior of Adversarial Training Estimator under + $\ell_\infty$-Perturbation + + +
+ Adversarial training has been proposed to protect machine learning models +against adversarial attacks. This paper focuses on adversarial training under +$\ell_\infty$-perturbation, which has recently attracted much research +attention. The asymptotic behavior of the adversarial training estimator is +investigated in the generalized linear model. The results imply that the +asymptotic distribution of the adversarial training estimator under +$\ell_\infty$-perturbation could put a positive probability mass at $0$ when +the true parameter is $0$, providing a theoretical guarantee of the associated +sparsity-recovery ability. Alternatively, a two-step procedure is proposed -- +adaptive adversarial training, which could further improve the performance of +adversarial training under $\ell_\infty$-perturbation. Specifically, the +proposed procedure could achieve asymptotic variable-selection consistency and +unbiasedness. Numerical experiments are conducted to show the sparsity-recovery +ability of adversarial training under $\ell_\infty$-perturbation and to compare +the empirical performance between classic adversarial training and adaptive +adversarial training. + +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Recurrence-Enhanced Vision-and-Language Transformers for Robust + Multimodal Document Retrieval CVPR 2025 + + +
+ Cross-modal retrieval is gaining increasing efficacy and interest from the +research community, thanks to large-scale training, novel architectural and +learning designs, and its application in LLMs and multimodal LLMs. In this +paper, we move a step forward and design an approach that allows for multimodal +queries, composed of both an image and a text, and can search within +collections of multimodal documents, where images and text are interleaved. Our +model, ReT, employs multi-level representations extracted from different layers +of both visual and textual backbones, both at the query and document side. To +allow for multi-level and cross-modal understanding and feature extraction, ReT +employs a novel Transformer-based recurrent cell that integrates both textual +and visual features at different layers, and leverages sigmoidal gates inspired +by the classical design of LSTMs. Extensive experiments on M2KR and M-BEIR +benchmarks show that ReT achieves state-of-the-art performance across diverse +settings. Our source code and trained models are publicly available at +https://github.com/aimagelab/ReT. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ☆ Improving the Efficiency of VVC using Partitioning of Reference Frames + + +
+ In response to the growing demand for high-quality videos, Versatile Video +Coding (VVC) was released in 2020, building on the hybrid coding architecture +of its predecessor, HEVC, achieving about 50% bitrate reduction for the same +visual quality. It introduces more flexible block partitioning, enhancing +compression efficiency at the cost of increased encoding complexity. To make +efficient use of VVC in practical applications, optimization is essential. +VVenC, an optimized open-source VVC encoder, introduces multiple presets to +address the trade-off between compression efficiency and encoder complexity. +Although an optimized set of encoding tools has been selected for each preset, +the rate-distortion (RD) search space in the encoder presets still poses a +challenge for efficient encoder implementations. In this paper, we propose +Early Termination using Reference Frames (ETRF), which improves the trade-off +between encoding efficiency and time complexity and positions itself as a new +preset between medium and fast presets. The CTU partitioning map of the +reference frames in lower temporal layers is employed to accelerate the +encoding of frames in higher temporal layers. The results show a reduction in +the encoding time of around 21% compared to the medium preset. Specifically, +for videos with high spatial and temporal complexities, which typically require +longer encoding times, the proposed method achieves a better trade-off between +bitrate savings and encoding time compared to the fast preset. + +
+
+
+
+
+ + ☆ Multi-resolution Encoding for HTTP Adaptive Streaming using VVenC + + +
+ HTTP Adaptive Streaming (HAS) is a widely adopted method for delivering video +content over the Internet, requiring each video to be encoded at multiple +bitrates and resolution pairs, known as representations, to adapt to various +network conditions and device capabilities. This multi-bitrate encoding +introduces significant challenges due to the computational and time-intensive +nature of encoding multiple representations. Conventional approaches often +encode these videos independently without leveraging similarities between +different representations of the same input video. This paper proposes an +accelerated multi-resolution encoding strategy that utilizes representations of +lower resolutions as references to speed up the encoding of higher resolutions +when using Versatile Video Coding (VVC); specifically in VVenC, an optimized +open-source software implementation. For multi-resolution encoding, a +mid-bitrate representation serves as the reference, allowing interpolated +encoded partition data to efficiently guide the partitioning process in higher +resolutions. The proposed approach uses shared encoding information to reduce +redundant calculations, optimizing partitioning decisions. Experimental results +demonstrate that the proposed technique achieves a reduction of up to 17% +compared to medium preset in encoding time across videos of varying +complexities with minimal BDBR/BDT of 0.12 compared to the fast preset. + +
+
+
+
+
+ + ☆ CorrNetDroid: Android Malware Detector leveraging a Correlation-based + Feature Selection for Network Traffic features + + +
+ Copious mobile operating systems exist in the market, but Android remains the +user's choice. Meanwhile, its growing popularity has also attracted malware +developers. Researchers have proposed various static solutions for Android +malware detection. However, stealthier malware evade static analysis. This +raises the need for a robust Android malware detection system capable of +dealing with advanced threats and overcoming the shortcomings of static +analysis. + Hence, this work proposes a dynamic analysis-based Android malware detection +system, CorrNetDroid, that works over network traffic flows. Many traffic +features exhibit overlapping ranges in normal and malware datasets. Therefore, +we first rank the features using two statistical measures, crRelevance and +Normalized Mean Residue Similarity (NMRS), to assess feature-class and +feature-feature correlations. Thereafter, we introduce a novel +correlation-based feature selection algorithm that applies NMRS on crRelevance +rankings to identify the optimal feature subset for Android malware detection. + Experimental results highlight that our model effectively reduces the feature +set while detecting Android malware with 99.50 percent accuracy when +considering only two network traffic features. Furthermore, our experiments +demonstrate that the NMRS-based algorithm on crRelevance rankings outperforms +statistical tests such as chi-square, ANOVA, Mann-Whitney U test, and +Kruskal-Wallis test. In addition, our model surpasses various state-of-the-art +Android malware detection techniques in terms of detection accuracy. + +
+
+
+
+
+ + ☆ Kiss3DGen: Repurposing Image Diffusion Models for 3D Asset Generation + + +
+ Diffusion models have achieved great success in generating 2D images. +However, the quality and generalizability of 3D content generation remain +limited. State-of-the-art methods often require large-scale 3D assets for +training, which are challenging to collect. In this work, we introduce +Kiss3DGen (Keep It Simple and Straightforward in 3D Generation), an efficient +framework for generating, editing, and enhancing 3D objects by repurposing a +well-trained 2D image diffusion model for 3D generation. Specifically, we +fine-tune a diffusion model to generate ''3D Bundle Image'', a tiled +representation composed of multi-view images and their corresponding normal +maps. The normal maps are then used to reconstruct a 3D mesh, and the +multi-view images provide texture mapping, resulting in a complete 3D model. +This simple method effectively transforms the 3D generation problem into a 2D +image generation task, maximizing the utilization of knowledge in pretrained +diffusion models. Furthermore, we demonstrate that our Kiss3DGen model is +compatible with various diffusion model techniques, enabling advanced features +such as 3D editing, mesh and texture enhancement, etc. Through extensive +experiments, we demonstrate the effectiveness of our approach, showcasing its +ability to produce high-quality 3D models efficiently. + +
+
+ comment: The first three authors contributed equally to this work +
+
+
+
+
+ + ☆ Streaming Piano Transcription Based on Consistent Onset and Offset + Decoding with Sustain Pedal Detection + + +
+ This paper describes a streaming audio-to-MIDI piano transcription approach +that aims to sequentially translate a music signal into a sequence of note +onset and offset events. The sequence-to-sequence nature of this task may call +for the computationally-intensive transformer model for better performance, +which has recently been used for offline transcription benchmarks and could be +extended for streaming transcription with causal attention mechanisms. We +assume that the performance limitation of this naive approach lies in the +decoder. Although time-frequency features useful for onset detection are +considerably different from those for offset detection, the single decoder is +trained to output a mixed sequence of onset and offset events without guarantee +of the correspondence between the onset and offset events of the same note. To +overcome this limitation, we propose a streaming encoder-decoder model that +uses a convolutional encoder aggregating local acoustic features, followed by +an autoregressive Transformer decoder detecting a variable number of onset +events and another decoder detecting the offset events for the active pitches +with validation of the sustain pedal at each time frame. Experiments using the +MAESTRO dataset showed that the proposed streaming method performed comparably +with or even better than the state-of-the-art offline methods while +significantly reducing the computational cost. + +
+
+ comment: Accepted to ISMIR 2024 +
+
+
+
+
+ + ☆ HOP: Heterogeneous Topology-based Multimodal Entanglement for Co-Speech + Gesture Generation CVPR 2025 + + +
+ Co-speech gestures are crucial non-verbal cues that enhance speech clarity +and expressiveness in human communication, which have attracted increasing +attention in multimodal research. While the existing methods have made strides +in gesture accuracy, challenges remain in generating diverse and coherent +gestures, as most approaches assume independence among multimodal inputs and +lack explicit modeling of their interactions. In this work, we propose a novel +multimodal learning method named HOP for co-speech gesture generation that +captures the heterogeneous entanglement between gesture motion, audio rhythm, +and text semantics, enabling the generation of coordinated gestures. By +leveraging spatiotemporal graph modeling, we achieve the alignment of audio and +action. Moreover, to enhance modality coherence, we build the audio-text +semantic representation based on a reprogramming module, which is beneficial +for cross-modality adaptation. Our approach enables the trimodal system to +learn each other's features and represent them in the form of topological +entanglement. Extensive experiments demonstrate that HOP achieves +state-of-the-art performance, offering more natural and expressive co-speech +gesture generation. More information, codes, and demos are available here: +https://star-uu-wang.github.io/HOP/ + +
+
+ comment: Accepted by CVPR 2025. See https://star-uu-wang.github.io/HOP/ +
+
+
+
+
+ + ♻ ☆ FoodMLLM-JP: Leveraging Multimodal Large Language Models for Japanese + Recipe Generation + + +
+ Research on food image understanding using recipe data has been a +long-standing focus due to the diversity and complexity of the data. Moreover, +food is inextricably linked to people's lives, making it a vital research area +for practical applications such as dietary management. Recent advancements in +Multimodal Large Language Models (MLLMs) have demonstrated remarkable +capabilities, not only in their vast knowledge but also in their ability to +handle languages naturally. While English is predominantly used, they can also +support multiple languages including Japanese. This suggests that MLLMs are +expected to significantly improve performance in food image understanding +tasks. We fine-tuned open MLLMs LLaVA-1.5 and Phi-3 Vision on a Japanese recipe +dataset and benchmarked their performance against the closed model GPT-4o. We +then evaluated the content of generated recipes, including ingredients and +cooking procedures, using 5,000 evaluation samples that comprehensively cover +Japanese food culture. Our evaluation demonstrates that the open models trained +on recipe data outperform GPT-4o, the current state-of-the-art model, in +ingredient generation. Our model achieved F1 score of 0.531, surpassing +GPT-4o's F1 score of 0.481, indicating a higher level of accuracy. Furthermore, +our model exhibited comparable performance to GPT-4o in generating cooking +procedure text. + +
+
+ comment: 15 pages, 5 figures. We found errors in the calculation of evaluation + metrics, which were corrected in this version with + $\color{blue}{\text{modifications highlighted in blue}}$. Please also see the + Appendix +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 70 + +
+
+
+ + ♻ ☆ Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging + Noise + + +
+ Accurate analysis of microscopy images is hindered by the presence of noise. +This noise is usually signal-dependent and often additionally correlated along +rows or columns of pixels. Current self- and unsupervised denoisers can address +signal-dependent noise, but none can reliably remove noise that is also row- or +column-correlated. Here, we present the first fully unsupervised deep +learning-based denoiser capable of handling imaging noise that is +row-correlated as well as signal-dependent. Our approach uses a Variational +Autoencoder (VAE) with a specially designed autoregressive decoder. This +decoder is capable of modeling row-correlated and signal-dependent noise but is +incapable of independently modeling underlying clean signal. The VAE therefore +produces latent variables containing only clean signal information, and these +are mapped back into image space using a proposed second decoder network. Our +method does not require a pre-trained noise model and can be trained from +scratch using unpaired noisy data. We benchmark our approach on microscopy +datasets from a range of imaging modalities and sensor types, each with row- +or column-correlated, signal-dependent noise, and show that it outperforms +existing self- and unsupervised denoisers. + +
+
+
+
+
+ + ♻ ☆ Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of + Encoders + + +
+ The ability to accurately interpret complex visual information is a crucial +topic of multimodal large language models (MLLMs). Recent work indicates that +enhanced visual perception significantly reduces hallucinations and improves +performance on resolution-sensitive tasks, such as optical character +recognition and document analysis. A number of recent MLLMs achieve this goal +using a mixture of vision encoders. Despite their success, there is a lack of +systematic comparisons and detailed ablation studies addressing critical +aspects, such as expert selection and the integration of multiple vision +experts. This study provides an extensive exploration of the design space for +MLLMs using a mixture of vision encoders and resolutions. Our findings reveal +several underlying principles common to various existing strategies, leading to +a streamlined yet effective design approach. We discover that simply +concatenating visual tokens from a set of complementary vision encoders is as +effective as more complex mixing architectures or strategies. We additionally +introduce Pre-Alignment to bridge the gap between vision-focused encoders and +language tokens, enhancing model coherence. The resulting family of MLLMs, +Eagle, surpasses other leading open-source models on major MLLM benchmarks. + +
+
+ comment: Github: https://github.com/NVlabs/Eagle, HuggingFace: + https://huggingface.co/NVEagle +
+
+
+
+
+ + ♻ ☆ SV-RAG: LoRA-Contextualizing Adaptation of MLLMs for Long Document + Understanding ICLR 2025 + + +
+ Multimodal large language models (MLLMs) have recently shown great progress +in text-rich image understanding, yet they still struggle with complex, +multi-page visually-rich documents. Traditional methods using document parsers +for retrieval-augmented generation suffer from performance and efficiency +limitations, while directly presenting all pages to MLLMs leads to +inefficiencies, especially with lengthy ones. In this work, we present a novel +framework named **S**elf-**V**isual **R**etrieval-**A**ugmented **G**eneration +(SV-RAG), which can broaden horizons of any MLLM to support long-document +understanding. We demonstrate that **MLLMs themselves can be an effective +multimodal retriever** to fetch relevant pages and then answer user questions +based on these pages. SV-RAG is implemented with two specific MLLM adapters, +one for evidence page retrieval and the other for question answering. Empirical +results show state-of-the-art performance on public benchmarks, demonstrating +the effectiveness of SV-RAG. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ ALBAR: Adversarial Learning approach to mitigate Biases in Action + Recognition ICLR 2025 + + +
+ Bias in machine learning models can lead to unfair decision making, and while +it has been well-studied in the image and text domains, it remains +underexplored in action recognition. Action recognition models often suffer +from background bias (i.e., inferring actions based on background cues) and +foreground bias (i.e., relying on subject appearance), which can be detrimental +to real-life applications such as autonomous vehicles or assisted living +monitoring. While prior approaches have mainly focused on mitigating background +bias using specialized augmentations, we thoroughly study both foreground and +background bias. We propose ALBAR, a novel adversarial training method that +mitigates foreground and background biases without requiring specialized +knowledge of the bias attributes. Our framework applies an adversarial +cross-entropy loss to the sampled static clip (where all the frames are the +same) and aims to make its class probabilities uniform using a proposed entropy +maximization loss. Additionally, we introduce a gradient penalty loss for +regularization against the debiasing process. We evaluate our method on +established background and foreground bias protocols, setting a new +state-of-the-art and strongly improving combined debiasing performance by over +12% absolute on HMDB51. Furthermore, we identify an issue of background leakage +in the existing UCF101 protocol for bias evaluation which provides a shortcut +to predict actions and does not provide an accurate measure of the debiasing +capability of a model. We address this issue by proposing more fine-grained +segmentation boundaries for the actor, where our method also outperforms +existing approaches. Project Page: +https://joefioresi718.github.io/ALBAR_webpage/ + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Fréchet Wavelet Distance: A Domain-Agnostic Metric for Image + Generation + + +
+ Modern metrics for generative learning like Fréchet Inception Distance +(FID) and DINOv2-Fréchet Distance (FD-DINOv2) demonstrate impressive +performance. However, they suffer from various shortcomings, like a bias +towards specific generators and datasets. To address this problem, we propose +the Fréchet Wavelet Distance (FWD) as a domain-agnostic metric based on the +Wavelet Packet Transform ($W_p$). FWD provides a sight across a broad spectrum +of frequencies in images with a high resolution, preserving both spatial and +textural aspects. Specifically, we use $W_p$ to project generated and real +images to the packet coefficient space. We then compute the Fréchet distance +with the resultant coefficients to evaluate the quality of a generator. This +metric is general-purpose and dataset-domain agnostic, as it does not rely on +any pre-trained network, while being more interpretable due to its ability to +compute Fréchet distance per packet, enhancing transparency. We conclude with +an extensive evaluation of a wide variety of generators across various datasets +that the proposed FWD can generalize and improve robustness to domain shifts +and various corruptions compared to other metrics. + +
+
+
+
+
+ + ♻ ☆ TESGNN: Temporal Equivariant Scene Graph Neural Networks for Efficient + and Robust Multi-View 3D Scene Understanding + + +
+ Scene graphs have proven to be highly effective for various scene +understanding tasks due to their compact and explicit representation of +relational information. However, current methods often overlook the critical +importance of preserving symmetry when generating scene graphs from 3D point +clouds, which can lead to reduced accuracy and robustness, particularly when +dealing with noisy, multi-view data. Furthermore, a major limitation of prior +approaches is the lack of temporal modeling to capture time-dependent +relationships among dynamically evolving entities in a scene. To address these +challenges, we propose Temporal Equivariant Scene Graph Neural Network +(TESGNN), consisting of two key components: (1) an Equivariant Scene Graph +Neural Network (ESGNN), which extracts information from 3D point clouds to +generate scene graph while preserving crucial symmetry properties, and (2) a +Temporal Graph Matching Network, which fuses scene graphs generated by ESGNN +across multiple time sequences into a unified global representation using an +approximate graph-matching algorithm. Our combined architecture TESGNN +outperforms current state-of-the-art methods in scene graph generation, +achieving higher accuracy and faster training convergence. Moreover, we show +that leveraging the symmetry-preserving property produces a more stable and +accurate global scene representation compared to existing approaches. Last but +not least, it is computationally efficient and easily implementable using +existing frameworks, making it well-suited for real-time applications in +robotics and computer vision. This approach paves the way for more robust and +scalable solutions to complex multi-view scene understanding challenges. Our +source code is publicly available at: https://github.com/HySonLab/TESGraph + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.00609 +
+
+
+
+
+ + ♻ ☆ Learning General-Purpose Biomedical Volume Representations using + Randomized Synthesis ICLR 2025 + + +
+ Current volumetric biomedical foundation models struggle to generalize as +public 3D datasets are small and do not cover the broad diversity of medical +procedures, conditions, anatomical regions, and imaging protocols. We address +this by creating a representation learning method that instead anticipates +strong domain shifts at training time itself. We first propose a data engine +that synthesizes highly variable training samples that would enable +generalization to new biomedical contexts. To then train a single 3D network +for any voxel-level task, we develop a contrastive learning method that +pretrains the network to be stable against nuisance imaging variation simulated +by the data engine, a key inductive bias for generalization. This network's +features can be used as robust representations of input images for downstream +tasks and its weights provide a strong, dataset-agnostic initialization for +finetuning on new datasets. As a result, we set new standards across both +multimodality registration and few-shot segmentation, a first for any 3D +biomedical vision model, all without (pre-)training on any existing dataset of +real images. + +
+
+ comment: ICLR 2025: International Conference on Learning Representations. Code + and model weights available at https://github.com/neel-dey/anatomix. + Keywords: synthetic data, representation learning, medical image analysis, + image registration, image segmentation +
+
+
+
+
+ + ♻ ☆ Tri-Clustering: A Multi-views Tri-level Information Fusion Context + Clustering Framework for Localization and Classification in Mammography + + +
+ Breast cancer is a significant global health issue, and the diagnosis of +breast imaging has always been challenging. Mammography images typically have +extremely high resolution, with lesions occupying only a very small area. +Down-sampling in neural networks can easily lead to the loss of +microcalcifications or subtle structures, making it difficult for traditional +neural network architectures to address these issues. To tackle these +challenges, we propose a Context Clustering Network with triple information +fusion. Firstly, compared to CNNs or transformers, we find that Context +clustering methods (1) are more computationally efficient and (2) can more +easily associate structural or pathological features, making them suitable for +the clinical tasks of mammography. Secondly, we propose a triple information +fusion mechanism that integrates global information, feature-based local +information, and patch-based local information. The proposed approach is +rigorously evaluated on two public datasets, Vindr-Mammo and CBIS-DDSM, using +five independent splits to ensure statistical robustness. Our method achieves +an AUC of 0.828 on Vindr-Mammo and 0.805 on CBIS-DDSM, outperforming the next +best method by 3.1% and 2.4%, respectively. These improvements are +statistically significant (p<0.05), underscoring the benefits of Context +Clustering Network with triple information fusion. Overall, our Context +Clustering framework demonstrates strong potential as a scalable and +cost-effective solution for large-scale mammography screening, enabling more +efficient and accurate breast cancer detection. Access to our method is +available at https://github.com/Sohyu1/Mammo_Clustering. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ FedBiP: Heterogeneous One-Shot Federated Learning with Personalized + Latent Diffusion Models CVPR 2025 + + +
+ One-Shot Federated Learning (OSFL), a special decentralized machine learning +paradigm, has recently gained significant attention. OSFL requires only a +single round of client data or model upload, which reduces communication costs +and mitigates privacy threats compared to traditional FL. Despite these +promising prospects, existing methods face challenges due to client data +heterogeneity and limited data quantity when applied to real-world OSFL +systems. Recently, Latent Diffusion Models (LDM) have shown remarkable +advancements in synthesizing high-quality images through pretraining on +large-scale datasets, thereby presenting a potential solution to overcome these +issues. However, directly applying pretrained LDM to heterogeneous OSFL results +in significant distribution shifts in synthetic data, leading to performance +degradation in classification models trained on such data. This issue is +particularly pronounced in rare domains, such as medical imaging, which are +underrepresented in LDM's pretraining data. To address this challenge, we +propose Federated Bi-Level Personalization (FedBiP), which personalizes the +pretrained LDM at both instance-level and concept-level. Hereby, FedBiP +synthesizes images following the client's local data distribution without +compromising the privacy regulations. FedBiP is also the first approach to +simultaneously address feature space heterogeneity and client data scarcity in +OSFL. Our method is validated through extensive experiments on three OSFL +benchmarks with feature space heterogeneity, as well as on challenging medical +and satellite image datasets with label heterogeneity. The results demonstrate +the effectiveness of FedBiP, which substantially outperforms other OSFL +methods. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Synthesizing Physically Plausible Human Motions in 3D Scenes 3DV 2024 + + +
+ We present a physics-based character control framework for synthesizing +human-scene interactions. Recent advances adopt physics simulation to mitigate +artifacts produced by data-driven kinematic approaches. However, existing +physics-based methods mainly focus on single-object environments, resulting in +limited applicability in realistic 3D scenes with multi-objects. To address +such challenges, we propose a framework that enables physically simulated +characters to perform long-term interaction tasks in diverse, cluttered, and +unseen 3D scenes. The key idea is to decouple human-scene interactions into two +fundamental processes, Interacting and Navigating, which motivates us to +construct two reusable Controllers, namely InterCon and NavCon. Specifically, +InterCon uses two complementary policies to enable characters to enter or leave +the interacting state with a particular object (e.g., sitting on a chair or +getting up). To realize navigation in cluttered environments, we introduce +NavCon, where a trajectory following policy enables characters to track +pre-planned collision-free paths. Benefiting from the divide and conquer +strategy, we can train all policies in simple environments and directly apply +them in complex multi-object scenes through coordination from a rule-based +scheduler. Video and code are available at +https://github.com/liangpan99/InterScene. + +
+
+ comment: 3DV 2024 version +
+
+
+
+
+ + ♻ ☆ Unleashing the Potential of Vision-Language Pre-Training for 3D + Zero-Shot Lesion Segmentation via Mask-Attribute Alignment ICLR 2025 + + +
+ Recent advancements in medical vision-language pre-training models have +driven significant progress in zero-shot disease recognition. However, +transferring image-level knowledge to pixel-level tasks, such as lesion +segmentation in 3D CT scans, remains a critical challenge. Due to the +complexity and variability of pathological visual characteristics, existing +methods struggle to align fine-grained lesion features not encountered during +training with disease-related textual representations. In this paper, we +present Malenia, a novel multi-scale lesion-level mask-attribute alignment +framework, specifically designed for 3D zero-shot lesion segmentation. Malenia +improves the compatibility between mask representations and their associated +elemental attributes, explicitly linking the visual features of unseen lesions +with the extensible knowledge learned from previously seen ones. Furthermore, +we design a Cross-Modal Knowledge Injection module to enhance both visual and +textual features with mutually beneficial information, effectively guiding the +generation of segmentation results. Comprehensive experiments across three +datasets and 12 lesion categories validate the superior performance of Malenia. + +
+
+ comment: Accepted as ICLR 2025 conference paper +
+
+
+
+
+ + ♻ ☆ Bidirectional Consistency Models ICML 2024 + + +
+ Diffusion models (DMs) are capable of generating remarkably high-quality +samples by iteratively denoising a random vector, a process that corresponds to +moving along the probability flow ordinary differential equation (PF ODE). +Interestingly, DMs can also invert an input image to noise by moving backward +along the PF ODE, a key operation for downstream tasks such as interpolation +and image editing. However, the iterative nature of this process restricts its +speed, hindering its broader application. Recently, Consistency Models (CMs) +have emerged to address this challenge by approximating the integral of the PF +ODE, largely reducing the number of iterations. Yet, the absence of an explicit +ODE solver complicates the inversion process. To resolve this, we introduce +Bidirectional Consistency Model (BCM), which learns a single neural network +that enables both forward and backward traversal along the PF ODE, efficiently +unifying generation and inversion tasks within one framework. We can train BCM +from scratch or tune it using a pretrained consistency model, which reduces the +training cost and increases scalability. We demonstrate that BCM enables +one-step generation and inversion while also allowing the use of additional +steps to enhance generation quality or reduce reconstruction error. We further +showcase BCM's capability in downstream tasks, such as interpolation and +inpainting. Our code and weights are available at +https://github.com/Mosasaur5526/BCM-iCT-torch. + +
+
+ comment: 39 pages, 27 figures; a shorter version of this paper was accepted + at the ICML 2024 Workshop on Structured Probabilistic Inference & Generative + Modeling +
+
+
+
+
+ + ♻ ☆ Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event + Condition For Foley Sound + + +
+ Foley sound synthesis is crucial for multimedia production, enhancing user +experience by synchronizing audio and video both temporally and semantically. +Recent studies on automating this labor-intensive process through +video-to-sound generation face significant challenges. Systems lacking explicit +temporal features suffer from poor alignment and controllability, while +timestamp-based models require costly and subjective human annotation. We +propose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as an +intuitive condition with semantic timbre prompts (audio or text). RMS, a +frame-level intensity envelope closely related to audio semantics, acts as a +temporal event feature to guide audio generation from video. The +annotation-free self-supervised learning framework consists of two stages, +Video2RMS and RMS2Sound, incorporating novel ideas including RMS discretization +and RMS-ControlNet with a pretrained text-to-audio model. Our extensive +evaluation shows that Video-Foley achieves state-of-the-art performance in +audio-visual alignment and controllability for sound timing, intensity, timbre, +and nuance. Source code, model weights and demos are available on our companion +website. (https://jnwnlee.github.io/video-foley-demo) + +
+
+
+
+
+ + ♻ ☆ LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One + Vision Token ICLR 2025 + + +
+ The advent of real-time large multimodal models (LMMs) like GPT-4o has +sparked considerable interest in efficient LMMs. LMM frameworks typically +encode visual inputs into vision tokens (continuous representations) and +integrate them and textual instructions into the context of large language +models (LLMs), where large-scale parameters and numerous context tokens +(predominantly vision tokens) result in substantial computational overhead. +Previous efforts towards efficient LMMs always focus on replacing the LLM +backbone with smaller models, while neglecting the crucial issue of token +quantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal +vision tokens. To achieve a high compression ratio of vision tokens while +preserving visual information, we first analyze how LMMs understand vision +tokens and find that most vision tokens only play a crucial role in the early +layers of LLM backbone, where they mainly fuse visual information into text +tokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to +fuse visual information into text tokens in advance, thereby facilitating the +extreme compression of vision tokens fed to LLM backbone into one token. +LLaVA-Mini is a unified large multimodal model that can support the +understanding of images, high-resolution images, and videos in an efficient +manner. Experiments across 11 image-based and 7 video-based benchmarks +demonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token +instead of 576. Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by +77%, deliver low-latency responses within 40 milliseconds, and process over +10,000 frames of video on the GPU hardware with 24GB of memory. + +
+
+ comment: Accepted to ICLR 2025. Code: https://github.com/ictnlp/LLaVA-Mini + Model: https://huggingface.co/ICTNLP/llava-mini-llama-3.1-8b +
+
+
+
+
+ + ♻ ☆ Drag Your Gaussian: Effective Drag-Based Editing with Score Distillation + for 3D Gaussian Splatting + + +
+ Recent advancements in 3D scene editing have been propelled by the rapid +development of generative models. Existing methods typically utilize generative +models to perform text-guided editing on 3D representations, such as 3D +Gaussian Splatting (3DGS). However, these methods are often limited to texture +modifications and fail when addressing geometric changes, such as editing a +character's head to turn around. Moreover, such methods lack accurate control +over the spatial position of editing results, as language struggles to +precisely describe the extent of edits. To overcome these limitations, we +introduce DYG, an effective 3D drag-based editing method for 3D Gaussian +Splatting. It enables users to conveniently specify the desired editing region +and the desired dragging direction through the input of 3D masks and pairs of +control points, thereby enabling precise control over the extent of editing. +DYG integrates the strengths of the implicit triplane representation to +establish the geometric scaffold of the editing results, effectively overcoming +suboptimal editing outcomes caused by the sparsity of 3DGS in the desired +editing regions. Additionally, we incorporate a drag-based Latent Diffusion +Model into our method through the proposed Drag-SDS loss function, enabling +flexible, multi-view consistent, and fine-grained editing. Extensive +experiments demonstrate that DYG conducts effective drag-based editing guided +by control point prompts, surpassing other baselines in terms of editing effect +and quality, both qualitatively and quantitatively. Visit our project page at +https://quyans.github.io/Drag-Your-Gaussian. + +
+
+ comment: Visit our project page at https://quyans.github.io/Drag-Your-Gaussian +
+
+
+
+
+ + ♻ ☆ Audio-Visual Instance Segmentation CVPR 2025 + + +
+ In this paper, we propose a new multi-modal task, termed audio-visual +instance segmentation (AVIS), which aims to simultaneously identify, segment +and track individual sounding object instances in audible videos. To facilitate +this research, we introduce a high-quality benchmark named AVISeg, containing +over 90K instance masks from 26 semantic categories in 926 long videos. +Additionally, we propose a strong baseline model for this task. Our model first +localizes sound source within each frame, and condenses object-specific +contexts into concise tokens. Then it builds long-range audio-visual +dependencies between these tokens using window-based attention, and tracks +sounding objects among the entire video sequences. Extensive experiments reveal +that our method performs best on AVISeg, surpassing the existing methods from +related tasks. We further conduct the evaluation on several multi-modal large +models. Unfortunately, they exhibit subpar performance on instance-level sound +source localization and temporal perception. We expect that AVIS will inspire +the community towards a more comprehensive multi-modal understanding. Dataset +and code are available at https://github.com/ruohaoguo/avis. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Evaluating Low-Resource Lane Following Algorithms for + Compute-Constrained Automated Vehicles + + +
+ Reliable lane-following is essential for automated and assisted driving, yet +existing solutions often rely on models that require extensive computational +resources, limiting their deployment in compute-constrained vehicles. We +evaluate five low-resource lane-following algorithms designed for real-time +operation on vehicles with limited computing resources. Performance was +assessed through simulation and deployment on real drive-by-wire electric +vehicles, with evaluation metrics including reliability, comfort, speed, and +adaptability. The top-performing methods used unsupervised learning to detect +and separate lane lines with processing time under 10 ms per frame, +outperforming compute-intensive and poorly generalizing deep learning approaches. +These approaches demonstrated robustness across lighting conditions, road +textures, and lane geometries. The findings highlight the potential for +efficient lane detection approaches to enhance the accessibility and +reliability of autonomous vehicle technologies. Reducing computing requirements +enables lane keeping to be widely deployed in vehicles as part of lower-level +automation, including active safety systems. + +
+
+ comment: Supported by the National Science Foundation under Grants No. 2150292 + and 2150096 +
+
+
+
+
+ + ♻ ☆ SPARTUN3D: Situated Spatial Understanding of 3D World in Large Language + Models + + +
+ Integrating the 3D world into large language models (3D-based LLMs) has been +a promising research direction for 3D scene understanding. However, current +3D-based LLMs fall short in situated understanding due to two key limitations: +1) existing 3D datasets are constructed from a global perspective of the 3D +scenes and lack situated context. 2) the architectures of existing 3D-based +LLMs lack explicit alignment between the spatial representations of 3D scenes +and natural language, limiting their performance in tasks requiring precise +spatial reasoning. We address these issues by introducing a scalable situated +3D dataset, named Spartun3D, that incorporates various situated spatial +reasoning tasks. Furthermore, we propose Spartun3D-LLM, built on an existing +3D-based LLM but integrated with a novel situated spatial alignment module, +aiming to enhance the alignment between 3D visual representations and their +corresponding textual descriptions. Experimental results demonstrate that both +our proposed dataset and alignment module significantly enhance the situated +spatial understanding of 3D-based LLMs. + +
+
+
+
+
+ + ♻ ☆ HMD^2: Environment-aware Motion Generation from Single Egocentric + Head-Mounted Device 3DV 2025 + + +
+ This paper investigates the generation of realistic full-body human motion +using a single head-mounted device with an outward-facing color camera and the +ability to perform visual SLAM. To address the ambiguity of this setup, we +present HMD^2, a novel system that balances motion reconstruction and +generation. From a reconstruction standpoint, it aims to maximally utilize the +camera streams to produce both analytical and learned features, including head +motion, SLAM point cloud, and image embeddings. On the generative front, HMD^2 +employs a multi-modal conditional motion diffusion model with a Transformer +backbone to maintain temporal coherence of generated motions, and utilizes +autoregressive inpainting to facilitate online motion inference with minimal +latency (0.17 seconds). We show that our system provides an effective and +robust solution that scales to a diverse dataset of over 200 hours of motion in +complex indoor and outdoor environments. + +
+
+ comment: International Conference on 3D Vision 2025 (3DV 2025) +
+
+
+
+
+ + ♻ ☆ Rethinking Audio-Visual Adversarial Vulnerability from Temporal and + Modality Perspectives ICLR 2025 + + +
+ While audio-visual learning equips models with a richer understanding of the +real world by leveraging multiple sensory modalities, this integration also +introduces new vulnerabilities to adversarial attacks. + In this paper, we present a comprehensive study of the adversarial robustness +of audio-visual models, considering both temporal and modality-specific +vulnerabilities. We propose two powerful adversarial attacks: 1) a temporal +invariance attack that exploits the inherent temporal redundancy across +consecutive time segments and 2) a modality misalignment attack that introduces +incongruence between the audio and visual modalities. These attacks are +designed to thoroughly assess the robustness of audio-visual models against +diverse threats. Furthermore, to defend against such attacks, we introduce a +novel audio-visual adversarial training framework. This framework addresses key +challenges in vanilla adversarial training by incorporating efficient +adversarial perturbation crafting tailored to multi-modal data and an +adversarial curriculum strategy. Extensive experiments in the Kinetics-Sounds +dataset demonstrate that our proposed temporal and modality-based attacks in +degrading model performance can achieve state-of-the-art performance, while our +adversarial training defense largely improves the adversarial robustness as +well as the adversarial training efficiency. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Tracking objects that change in appearance with phase synchrony + + +
+ Objects we encounter often change appearance as we interact with them. +Changes in illumination (shadows), object pose, or the movement of non-rigid +objects can drastically alter available image features. How do biological +visual systems track objects as they change? One plausible mechanism involves +attentional mechanisms for reasoning about the locations of objects +independently of their appearances -- a capability that prominent neuroscience +theories have associated with computing through neural synchrony. Here, we +describe a novel deep learning circuit that can learn to precisely control +attention to features separately from their location in the world through +neural synchrony: the complex-valued recurrent neural network (CV-RNN). Next, +we compare object tracking in humans, the CV-RNN, and other deep neural +networks (DNNs), using FeatureTracker: a large-scale challenge that asks +observers to track objects as their locations and appearances change in +precisely controlled ways. While humans effortlessly solved FeatureTracker, +state-of-the-art DNNs did not. In contrast, our CV-RNN behaved similarly to +humans on the challenge, providing a computational proof-of-concept for the +role of phase synchronization as a neural substrate for tracking +appearance-morphing objects as they move about. + +
+
+
+
+
+ + ♻ ☆ A Dual-Purpose Framework for Backdoor Defense and Backdoor Amplification + in Diffusion Models + + +
+ Diffusion models have emerged as state-of-the-art generative frameworks, +excelling in producing high-quality multi-modal samples. However, recent +studies have revealed their vulnerability to backdoor attacks, where backdoored +models generate specific, undesirable outputs called backdoor target (e.g., +harmful images) when a pre-defined trigger is embedded to their inputs. In this +paper, we propose PureDiffusion, a dual-purpose framework that simultaneously +serves two contrasting roles: backdoor defense and backdoor attack +amplification. For defense, we introduce two novel loss functions to invert +backdoor triggers embedded in diffusion models. The first leverages +trigger-induced distribution shifts across multiple timesteps of the diffusion +process, while the second exploits the denoising consistency effect when a +backdoor is activated. Once an accurate trigger inversion is achieved, we +develop a backdoor detection method that analyzes both the inverted trigger and +the generated backdoor targets to identify backdoor attacks. In terms of attack +amplification with the role of an attacker, we describe how our trigger +inversion algorithm can be used to reinforce the original trigger embedded in +the backdoored diffusion model. This significantly boosts attack performance +while reducing the required backdoor training time. Experimental results +demonstrate that PureDiffusion achieves near-perfect detection accuracy, +outperforming existing defenses by a large margin, particularly against complex +trigger patterns. Additionally, in an attack scenario, our attack amplification +approach elevates the attack success rate (ASR) of existing backdoor attacks to +nearly 100% while reducing training time by up to 20x. + +
+
+
+
+
+ + ♻ ☆ Test-Time Adaptation for Combating Missing Modalities in Egocentric + Videos + + +
+ Understanding videos that contain multiple modalities is crucial, especially +in egocentric videos, where combining various sensory inputs significantly +improves tasks like action recognition and moment localization. However, +real-world applications often face challenges with incomplete modalities due to +privacy concerns, efficiency needs, or hardware issues. Current methods, while +effective, often necessitate retraining the model entirely to handle missing +modalities, making them computationally intensive, particularly with large +training datasets. In this study, we propose a novel approach to address this +issue at test time without requiring retraining. We frame the problem as a +test-time adaptation task, where the model adjusts to the available unlabeled +data at test time. Our method, MiDl (Mutual information with +self-Distillation), encourages the model to be insensitive to the specific +modality source present during testing by minimizing the mutual information +between the prediction and the available modality. Additionally, we incorporate +self-distillation to maintain the model's original performance when both +modalities are available. MiDl represents the first self-supervised, online +solution for handling missing modalities exclusively at test time. Through +experiments with various pretrained models and datasets, MiDl demonstrates +substantial performance improvement without the need for retraining. + +
+
+
+
+
+ + ♻ ☆ DIPSER: A Dataset for In-Person Student Engagement Recognition in the + Wild + + +
+ In this paper, a novel dataset is introduced, designed to assess student +attention within in-person classroom settings. This dataset encompasses RGB +camera data, featuring multiple cameras per student to capture both posture and +facial expressions, in addition to smartwatch sensor data for each individual. +This dataset allows machine learning algorithms to be trained to predict +attention and correlate it with emotion. A comprehensive suite of attention and +emotion labels for each student is provided, generated through self-reporting +as well as evaluations by four different experts. Our dataset uniquely combines +facial and environmental camera data, smartwatch metrics, and includes +underrepresented ethnicities in similar datasets, all within in-the-wild, +in-person settings, making it the most comprehensive dataset of its kind +currently available. + The dataset presented offers an extensive and diverse collection of data +pertaining to student interactions across different educational contexts, +augmented with additional metadata from other tools. This initiative addresses +existing deficiencies by offering a valuable resource for the analysis of +student attention and emotion in face-to-face lessons. + +
+
+
+
+
+ + ♻ ☆ MOVE: Effective and Harmless Ownership Verification via Embedded + External Features AAAI 2022 + + +
+ Currently, deep neural networks (DNNs) are widely adopted in different +applications. Despite their commercial value, training a well-performing DNN is +resource-consuming. Accordingly, the well-trained model is valuable +intellectual property for its owner. However, recent studies revealed the +threats of model stealing, where the adversaries can obtain a function-similar +copy of the victim model, even when they can only query the model. In this +paper, we propose an effective and harmless model ownership verification (MOVE) +to defend against different types of model stealing simultaneously, without +introducing new security risks. In general, we conduct the ownership +verification by verifying whether a suspicious model contains the knowledge of +defender-specified external features. Specifically, we embed the external +features by modifying a few training samples with style transfer. We then train +a meta-classifier to determine whether a model is stolen from the victim. This +approach is inspired by the understanding that the stolen models should contain +the knowledge of features learned by the victim model. In particular, +we develop our MOVE method under both white-box and black-box +settings and analyze its theoretical foundation to provide comprehensive model +protection. Extensive experiments on benchmark datasets verify the +effectiveness of our method and its resistance to potential adaptive attacks. +The codes for reproducing the main experiments of our method are available at +https://github.com/THUYimingLi/MOVE. + +
+
+ comment: This paper has been accepted by IEEE TPAMI 2025. It is the journal + extension of our conference paper in AAAI 2022 + (https://ojs.aaai.org/index.php/AAAI/article/view/20036). 18 pages +
+
+
+
+
+ + ♻ ☆ K-LoRA: Unlocking Training-Free Fusion of Any Subject and Style LoRAs CVPR 2025 + + +
+ Recent studies have explored combining different LoRAs to jointly generate +learned style and content. However, existing methods either fail to effectively +preserve both the original subject and style simultaneously or require +additional training. In this paper, we argue that the intrinsic properties of +LoRA can effectively guide diffusion models in merging learned subject and +style. Building on this insight, we propose K-LoRA, a simple yet effective +training-free LoRA fusion approach. In each attention layer, K-LoRA compares +the Top-K elements in each LoRA to be fused, determining which LoRA to select +for optimal fusion. This selection mechanism ensures that the most +representative features of both subject and style are retained during the +fusion process, effectively balancing their contributions. Experimental results +demonstrate that the proposed method effectively integrates the subject and +style information learned by the original LoRAs, outperforming state-of-the-art +training-based approaches in both qualitative and quantitative results. + +
+
+ comment: CVPR 2025, Project page: https://k-lora.github.io/K-LoRA.io/ +
+
+
+
+
+ + ♻ ☆ Intrinsic Dimension Correlation: uncovering nonlinear connections in + multimodal representations ICLR 2025 + + +
+ To gain insight into the mechanisms behind machine learning methods, it is +crucial to establish connections among the features describing data points. +However, these correlations often exhibit a high-dimensional and strongly +nonlinear nature, which makes them challenging to detect using standard +methods. This paper exploits the entanglement between intrinsic dimensionality +and correlation to propose a metric that quantifies the (potentially nonlinear) +correlation between high-dimensional manifolds. We first validate our method on +synthetic data in controlled environments, showcasing its advantages and +drawbacks compared to existing techniques. Subsequently, we extend our analysis +to large-scale applications in neural network representations. Specifically, we +focus on latent representations of multimodal data, uncovering clear +correlations between paired visual and textual embeddings, whereas existing +methods struggle significantly in detecting similarity. Our results indicate +the presence of highly nonlinear correlation patterns between latent manifolds. + +
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Q-Bench-Video: Benchmarking the Video Quality Understanding of LMMs + + +
+ With the rising interest in research on Large Multi-modal Models (LMMs) for +video understanding, many studies have emphasized general video comprehension +capabilities, neglecting the systematic exploration into video quality +understanding. To address this oversight, we introduce Q-Bench-Video in this +paper, a new benchmark specifically designed to evaluate LMMs' proficiency in +discerning video quality. a) To ensure video source diversity, Q-Bench-Video +encompasses videos from natural scenes, AI-generated Content (AIGC), and +Computer Graphics (CG). b) Building on the traditional multiple-choice +questions format with the Yes-or-No and What-How categories, we include +Open-ended questions to better evaluate complex scenarios. Additionally, we +incorporate the video pair quality comparison question to enhance +comprehensiveness. c) Beyond the traditional Technical, Aesthetic, and Temporal +distortions, we have expanded our evaluation aspects to include the dimension +of AIGC distortions, which addresses the increasing demand for video +generation. Finally, we collect a total of 2,378 question-answer pairs and test +them on 12 open-source & 5 proprietary LMMs. Our findings indicate that while +LMMs have a foundational understanding of video quality, their performance +remains incomplete and imprecise, with a notable discrepancy compared to human +performance. Through Q-Bench-Video, we seek to catalyze community interest, +stimulate further research, and unlock the untapped potential of LMMs to close +the gap in video quality understanding. + +
+
+
+
+
+ + ♻ ☆ HyperFace: Generating Synthetic Face Recognition Datasets by Exploring + Face Embedding Hypersphere ICLR 2025 + + +
+ Face recognition datasets are often collected by crawling the Internet +without individuals' consent, raising ethical and privacy concerns. Generating +synthetic datasets for training face recognition models has emerged as a +promising alternative. However, the generation of synthetic datasets remains +challenging as it entails adequate inter-class and intra-class variations. +While advances in generative models have made it easier to increase intra-class +variations in face datasets (such as pose, illumination, etc.), generating +sufficient inter-class variation is still a difficult task. In this paper, we +formulate the dataset generation as a packing problem on the embedding space +(represented on a hypersphere) of a face recognition model and propose a new +synthetic dataset generation approach, called HyperFace. We formalize our +packing problem as an optimization problem and solve it with a gradient +descent-based approach. Then, we use a conditional face generator model to +synthesize face images from the optimized embeddings. We use our generated +datasets to train face recognition models and evaluate the trained models on +several benchmarking real datasets. Our experimental results show that models +trained with HyperFace achieve state-of-the-art performance in training face +recognition using synthetic datasets. + +
+
+ comment: Accepted in ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Improved Baselines with Synchronized Encoding for Universal Medical + Image Segmentation + + +
+ Large foundation models, known for their strong zero-shot generalization +capabilities, can be applied to a wide range of downstream tasks. However, +developing foundation models for medical image segmentation poses a significant +challenge due to the domain gap between natural and medical images. While +fine-tuning techniques based on the Segment Anything Model (SAM) have been +explored, they primarily focus on scaling up data or refining inference +strategies without incorporating domain-specific architectural designs, +limiting their zero-shot performance. To optimize segmentation performance +under standard inference settings and provide a strong baseline for future +research, we introduce SyncSAM, which employs a synchronized dual-branch +encoder that integrates convolution and Transformer features in a synchronized +manner to enhance medical image encoding, and a multi-scale dual-branch decoder +to preserve image details. SyncSAM is trained on two of the largest medical +image segmentation datasets, SA-Med2D-20M and IMed-361M, resulting in a series +of pre-trained models for universal medical image segmentation. Experimental +results demonstrate that SyncSAM not only achieves state-of-the-art performance +on test sets but also exhibits strong zero-shot capabilities on unseen +datasets. The code and model weights are available at +https://github.com/Hhankyangg/SyncSAM. + +
+
+
+
+
+ + ♻ ☆ StochSync: Stochastic Diffusion Synchronization for Image Generation in + Arbitrary Spaces ICLR 2025 + + +
+ We propose a zero-shot method for generating images in arbitrary spaces +(e.g., a sphere for 360° panoramas and a mesh surface for texture) using a +pretrained image diffusion model. The zero-shot generation of various visual +content using a pretrained image diffusion model has been explored mainly in +two directions. First, Diffusion Synchronization -- performing reverse diffusion +processes jointly across different projected spaces while synchronizing them in +the target space -- generates high-quality outputs when enough conditioning is +provided, but it struggles in its absence. Second, Score Distillation +Sampling -- gradually updating the target space data through gradient +descent -- results in better coherence but often lacks detail. In this paper, we +reveal for the first time the interconnection between these two methods while +highlighting their differences. To this end, we propose StochSync, a novel +approach that combines the strengths of both, enabling effective performance +with weak conditioning. Our experiments demonstrate that StochSync provides the +best performance in 360° panorama generation (where image conditioning is +not given), outperforming previous finetuning-based methods, and also delivers +comparable results in 3D mesh texturing (where depth conditioning is provided) +with previous methods. + +
+
+ comment: Project page: https://stochsync.github.io/ (ICLR 2025) +
+
+
+
+
+ + ♻ ☆ DartControl: A Diffusion-Based Autoregressive Motion Model for Real-Time + Text-Driven Motion Control ICLR + + +
+ Text-conditioned human motion generation, which allows for user interaction +through natural language, has become increasingly popular. Existing methods +typically generate short, isolated motions based on a single input sentence. +However, human motions are continuous and can extend over long periods, +carrying rich semantics. Creating long, complex motions that precisely respond +to streams of text descriptions, particularly in an online and real-time +setting, remains a significant challenge. Furthermore, incorporating spatial +constraints into text-conditioned motion generation presents additional +challenges, as it requires aligning the motion semantics specified by text +descriptions with geometric information, such as goal locations and 3D scene +geometry. To address these limitations, we propose DartControl, in short DART, +a Diffusion-based Autoregressive motion primitive model for Real-time +Text-driven motion control. Our model effectively learns a compact motion +primitive space jointly conditioned on motion history and text inputs using +latent diffusion models. By autoregressively generating motion primitives based +on the preceding history and current text input, DART enables real-time, +sequential motion generation driven by natural language descriptions. +Additionally, the learned motion primitive space allows for precise spatial +motion control, which we formulate either as a latent noise optimization +problem or as a Markov decision process addressed through reinforcement +learning. We present effective algorithms for both approaches, demonstrating +our model's versatility and superior performance in various motion synthesis +tasks. Experiments show our method outperforms existing baselines in motion +realism, efficiency, and controllability. Video results are available on the +project page: https://zkf1997.github.io/DART/. + +
+
+ comment: Updated ICLR camera ready version +
+
+
+
+
+ + ♻ ☆ CogCoM: A Visual Language Model with Chain-of-Manipulations Reasoning + + +
+ Vision-Language Models (VLMs) have demonstrated their broad effectiveness +thanks to extensive training in aligning visual instructions to responses. +However, such training of conclusive alignment leads models to ignore essential +visual reasoning, further resulting in failures in meticulous visual problems +and unfaithful responses. Drawing inspiration from human cognition in solving +visual problems (e.g., marking, zoom in), this paper introduces Chain of +Manipulations, a mechanism that enables VLMs to solve problems step-by-step +with evidence. After training, models can solve various visual problems by +eliciting intrinsic manipulations (e.g., grounding, zoom in) with results +(e.g., boxes, image) actively without involving external tools, while also +allowing users to trace error causes. We study the roadmap to implement this +mechanism, including (1) a flexible design of manipulations upon extensive +analysis, (2) an efficient automated data generation pipeline, (3) a compatible +VLM architecture capable of multi-turn multi-image, and (4) a model training +process for versatile capabilities. With the design, we also manually annotate +6K high-quality samples for the challenging graphical mathematical problems. +Our trained model, CogCoM, equipped with this mechanism with 17B +parameters achieves state-of-the-art performance across 9 benchmarks from 4 +categories, demonstrating the effectiveness while preserving the +interpretability. Our code, model weights, and collected data are publicly +available at https://github.com/THUDM/CogCoM. + +
+
+ comment: 21 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ CLIPure: Purification in Latent Space via CLIP for Adversarially Robust + Zero-Shot Classification ICLR 2025 + + +
+ In this paper, we aim to build an adversarially robust zero-shot image +classifier. We ground our work on CLIP, a vision-language pre-trained encoder +model that can perform zero-shot classification by matching an image with text +prompts ``a photo of a &lt;class-name&gt;.''. Purification is the path we choose +since it does not require adversarial training on specific attack types and +thus can cope with any foreseen attacks. We then formulate purification risk as +the KL divergence between the joint distributions of the purification process +of denoising the adversarial samples and the attack process of adding +perturbations to benign samples, through bidirectional Stochastic Differential +Equations (SDEs). The final derived results inspire us to explore purification +in the multi-modal latent space of CLIP. We propose two variants for our +CLIPure approach: CLIPure-Diff which models the likelihood of images' latent +vectors with the DiffusionPrior module in DaLLE-2 (modeling the generation +process of CLIP's latent vectors), and CLIPure-Cos which models the likelihood +with the cosine similarity between the embeddings of an image and ``a photo of +a.''. As far as we know, CLIPure is the first purification method in +multi-modal latent space and CLIPure-Cos is the first purification method that +is not based on generative models, which substantially improves defense +efficiency. We conducted extensive experiments on CIFAR-10, ImageNet, and 13 +datasets that previous CLIP-based defense methods used for evaluating zero-shot +classification robustness. Results show that CLIPure boosts the SOTA robustness +by a large margin, e.g., from 71.7% to 91.1% on CIFAR10, from 59.6% to 72.6% on +ImageNet, and 108% relative improvements of average robustness on the 13 +datasets over previous SOTA. The code is available at +https://github.com/TMLResearchGroup-CAS/CLIPure. + +
+
+ comment: accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ L-WISE: Boosting Human Visual Category Learning Through Model-Based + Image Selection And Enhancement + + +
+ The currently leading artificial neural network models of the visual ventral +stream - which are derived from a combination of performance optimization and +robustification methods - have demonstrated a remarkable degree of behavioral +alignment with humans on visual categorization tasks. We show that image +perturbations generated by these models can enhance the ability of humans to +accurately report the ground truth class. Furthermore, we find that the same +models can also be used out-of-the-box to predict the proportion of correct +human responses to individual images, providing a simple, human-aligned +estimator of the relative difficulty of each image. Motivated by these +observations, we propose to augment visual learning in humans in a way that +improves human categorization accuracy at test time. Our learning augmentation +approach consists of (i) selecting images based on their model-estimated +recognition difficulty, and (ii) applying image perturbations that aid +recognition for novice learners. We find that combining these model-based +strategies leads to categorization accuracy gains of 33-72% relative to control +subjects without these interventions, on unmodified, randomly selected held-out +test images. Beyond the accuracy gain, the training time for the augmented +learning group was also shortened by 20-23%, despite both groups completing the +same number of training trials. We demonstrate the efficacy of our approach in +a fine-grained categorization task with natural images, as well as two tasks in +clinically relevant image domains - histology and dermoscopy - where visual +learning is notoriously challenging. To the best of our knowledge, our work is +the first application of artificial neural networks to increase visual learning +performance in humans by enhancing category-specific image features. + +
+
+
+
+
+ + ♻ ☆ Padding Tone: A Mechanistic Analysis of Padding Tokens in T2I Models NAACL 2025 + + +
+ Text-to-image (T2I) diffusion models rely on encoded prompts to guide the +image generation process. Typically, these prompts are extended to a fixed +length by adding padding tokens before text encoding. Despite being a default +practice, the influence of padding tokens on the image generation process has +not been investigated. In this work, we conduct the first in-depth analysis of +the role padding tokens play in T2I models. We develop two causal techniques to +analyze how information is encoded in the representation of tokens across +different components of the T2I pipeline. Using these techniques, we +investigate when and how padding tokens impact the image generation process. +Our findings reveal three distinct scenarios: padding tokens may affect the +model's output during text encoding, during the diffusion process, or be +effectively ignored. Moreover, we identify key relationships between these +scenarios and the model's architecture (cross or self-attention) and its +training process (frozen or trained text encoder). These insights contribute to +a deeper understanding of the mechanisms of padding tokens, potentially +informing future model design and training practices in T2I systems. + +
+
+ comment: Published in: NAACL 2025. Project webpage: + https://padding-tone.github.io/ +
+
+
+
+
+ + ♻ ☆ Pair-VPR: Place-Aware Pre-training and Contrastive Pair Classification + for Visual Place Recognition with Vision Transformers + + +
+ In this work we propose a novel joint training method for Visual Place +Recognition (VPR), which simultaneously learns a global descriptor and a pair +classifier for re-ranking. The pair classifier can predict whether a given pair +of images are from the same place or not. The network only comprises Vision +Transformer components for both the encoder and the pair classifier, and both +components are trained using their respective class tokens. In existing VPR +methods, typically the network is initialized using pre-trained weights from a +generic image dataset such as ImageNet. In this work we propose an alternative +pre-training strategy, by using Siamese Masked Image Modelling as a +pre-training task. We propose a Place-aware image sampling procedure from a +collection of large VPR datasets for pre-training our model, to learn visual +features tuned specifically for VPR. By re-using the Mask Image Modelling +encoder and decoder weights in the second stage of training, Pair-VPR can +achieve state-of-the-art VPR performance across five benchmark datasets with a +ViT-B encoder, along with further improvements in localization recall with +larger encoders. The Pair-VPR website is: +https://csiro-robotics.github.io/Pair-VPR. + +
+
+
+
+
+ + ♻ ☆ WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model + Evaluation + + +
+ The UAV technology is gradually maturing and can provide extremely powerful +support for smart agriculture and precise monitoring. Currently, there is no +dataset related to green walnuts in the field of agricultural computer vision. +Thus, in order to promote the algorithm design in the field of agricultural +computer vision, we used UAV to collect remote-sensing data from 8 walnut +sample plots. Considering that green walnuts are subject to various lighting +conditions and occlusion, we constructed a large-scale dataset with a +higher-granularity of target features - WalnutData. This dataset contains a +total of 30,240 images and 706,208 instances, and there are 4 target +categories: being illuminated by frontal light and unoccluded (A1), being +backlit and unoccluded (A2), being illuminated by frontal light and occluded +(B1), and being backlit and occluded (B2). Subsequently, we evaluated many +mainstream algorithms on WalnutData and used these evaluation results as the +baseline standard. The dataset and all evaluation results can be obtained at +https://github.com/1wuming/WalnutData. + +
+
+
+
+
+ + ♻ ☆ High-Resolution Image Synthesis via Next-Token Prediction + + +
+ Recently, autoregressive models have demonstrated remarkable performance in +class-conditional image generation. However, the application of next-token +prediction to high-resolution text-to-image generation remains largely +unexplored. In this paper, we introduce \textbf{D-JEPA$\cdot$T2I}, an +autoregressive model based on continuous tokens that incorporates innovations +in both architecture and training strategy to generate high-quality, +photorealistic images at arbitrary resolutions, up to 4K. Architecturally, we +adopt the denoising joint embedding predictive architecture (D-JEPA) while +leveraging a multimodal visual transformer to effectively integrate textual and +visual features. Additionally, we introduce flow matching loss alongside the +proposed Visual Rotary Positional Embedding (VoPE) to enable continuous +resolution learning. In terms of training strategy, we propose a data feedback +mechanism that dynamically adjusts the sampling procedure based on statistical +analysis and an online learning critic model. This encourages the model to move +beyond its comfort zone, reducing redundant training on well-mastered scenarios +and compelling it to address more challenging cases with suboptimal generation +quality. For the first time, we achieve state-of-the-art high-resolution image +synthesis via next-token prediction. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ ET-SEED: Efficient Trajectory-Level SE(3) Equivariant Diffusion Policy ICLR 2025 + + +
+ Imitation learning, e.g., diffusion policy, has been proven effective in +various robotic manipulation tasks. However, extensive demonstrations are +required for policy robustness and generalization. To reduce the demonstration +reliance, we leverage spatial symmetry and propose ET-SEED, an efficient +trajectory-level SE(3) equivariant diffusion model for generating action +sequences in complex robot manipulation tasks. Further, previous equivariant +diffusion models require the per-step equivariance in the Markov process, +making it difficult to learn policy under such strong constraints. We +theoretically extend equivariant Markov kernels and simplify the condition of +equivariant diffusion process, thereby significantly improving training +efficiency for trajectory-level SE(3) equivariant diffusion policy in an +end-to-end manner. We evaluate ET-SEED on representative robotic manipulation +tasks, involving rigid body, articulated and deformable object. Experiments +demonstrate superior data efficiency and manipulation proficiency of our +proposed method, as well as its ability to generalize to unseen configurations +with only a few demonstrations. Website: https://et-seed.github.io/ + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Autoregressive Video Generation without Vector Quantization ICLR 2025 + + +
+ This paper presents a novel approach that enables autoregressive video +generation with high efficiency. We propose to reformulate the video generation +problem as a non-quantized autoregressive modeling of temporal frame-by-frame +prediction and spatial set-by-set prediction. Unlike raster-scan prediction in +prior autoregressive models or joint distribution modeling of fixed-length +tokens in diffusion models, our approach maintains the causal property of +GPT-style models for flexible in-context capabilities, while leveraging +bidirectional modeling within individual frames for efficiency. With the +proposed approach, we train a novel video autoregressive model without vector +quantization, termed NOVA. Our results demonstrate that NOVA surpasses prior +autoregressive video models in data efficiency, inference speed, visual +fidelity, and video fluency, even with a much smaller model capacity, i.e., +0.6B parameters. NOVA also outperforms state-of-the-art image diffusion models +in text-to-image generation tasks, with a significantly lower training cost. +Additionally, NOVA generalizes well across extended video durations and enables +diverse zero-shot applications in one unified model. Code and models are +publicly available at https://github.com/baaivision/NOVA. + +
+
+ comment: Accepted to ICLR 2025. Project page at + https://github.com/baaivision/NOVA +
+
+
+
+
+ + ♻ ☆ SEED-X: Multimodal Models with Unified Multi-granularity Comprehension + and Generation + + +
+ The rapid evolution of multimodal foundation model has demonstrated +significant progresses in vision-language understanding and generation, e.g., +our previous work SEED-LLaMA. However, there remains a gap between its +capability and the real-world applicability, primarily due to the model's +limited capacity to effectively respond to various user instructions and +interact with diverse visual data. In this work, we focus on bridging this gap +through integrating two enhanced features: (1) comprehending images of +arbitrary sizes and ratios, and (2) enabling multi-granularity image +generation. We present a unified and versatile foundation model, namely, +SEED-X, which is able to model multi-granularity visual semantics for +comprehension and generation tasks. Besides the competitive results on public +benchmarks, SEED-X demonstrates its effectiveness in handling real-world +applications across various domains after instruction tuning. We hope that our +work will inspire future research into what can be achieved by versatile +multimodal foundation models in real-world applications. The models, codes, and +datasets are released in https://github.com/AILab-CVC/SEED-X. + +
+
+ comment: We added benchmark results (without updating models) and ablation + study in this version. Project released at: + https://github.com/AILab-CVC/SEED-X +
+
+
+
+
+ + ♻ ☆ Predictive Uncertainty Quantification for Bird's Eye View Segmentation: + A Benchmark and Novel Loss Function ICLR 2025 + + +
+ The fusion of raw sensor data to create a Bird's Eye View (BEV) +representation is critical for autonomous vehicle planning and control. Despite +the growing interest in using deep learning models for BEV semantic +segmentation, anticipating segmentation errors and enhancing the explainability +of these models remain underexplored. This paper introduces a comprehensive +benchmark for predictive uncertainty quantification in BEV segmentation, +evaluating multiple uncertainty quantification methods across three popular +datasets with three representative network architectures. Our study focuses on +the effectiveness of quantified uncertainty in detecting misclassified and +out-of-distribution (OOD) pixels while also improving model calibration. +Through empirical analysis, we uncover challenges in existing uncertainty +quantification methods and demonstrate the potential of evidential deep +learning techniques, which capture both aleatoric and epistemic uncertainty. To +address these challenges, we propose a novel loss function, +Uncertainty-Focal-Cross-Entropy (UFCE), specifically designed for highly +imbalanced data, along with a simple uncertainty-scaling regularization term +that improves both uncertainty quantification and model calibration for BEV +segmentation. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ LANTERN: Accelerating Visual Autoregressive Models with Relaxed + Speculative Decoding ICLR 2025 + + +
+ Auto-Regressive (AR) models have recently gained prominence in image +generation, often matching or even surpassing the performance of diffusion +models. However, one major limitation of AR models is their sequential nature, +which processes tokens one at a time, slowing down generation compared to +models like GANs or diffusion-based methods that operate more efficiently. +While speculative decoding has proven effective for accelerating LLMs by +generating multiple tokens in a single forward, its application in visual AR +models remains largely unexplored. In this work, we identify a challenge in +this setting, which we term \textit{token selection ambiguity}, wherein visual +AR models frequently assign uniformly low probabilities to tokens, hampering +the performance of speculative decoding. To overcome this challenge, we propose +a relaxed acceptance condition referred to as LANTERN that leverages the +interchangeability of tokens in latent space. This relaxation restores the +effectiveness of speculative decoding in visual AR models by enabling more +flexible use of candidate tokens that would otherwise be prematurely rejected. +Furthermore, by incorporating a total variation distance bound, we ensure that +these speed gains are achieved without significantly compromising image quality +or semantic coherence. Experimental results demonstrate the efficacy of our +method in providing a substantial speed-up over speculative decoding. In +specific, compared to a na\"ive application of the state-of-the-art speculative +decoding, LANTERN increases speed-ups by $\mathbf{1.75}\times$ and +$\mathbf{1.82}\times$, as compared to greedy decoding and random sampling, +respectively, when applied to LlamaGen, a contemporary visual AR model. The +code is publicly available at https://github.com/jadohu/LANTERN. + +
+
+ comment: 30 pages, 13 figures, Accepted to ICLR 2025 (poster) +
+
+
+
+
+ + ♻ ☆ InterMask: 3D Human Interaction Generation via Collaborative Masked + Modeling + + +
+ Generating realistic 3D human-human interactions from textual descriptions +remains a challenging task. Existing approaches, typically based on diffusion +models, often produce results lacking realism and fidelity. In this work, we +introduce InterMask, a novel framework for generating human interactions using +collaborative masked modeling in discrete space. InterMask first employs a +VQ-VAE to transform each motion sequence into a 2D discrete motion token map. +Unlike traditional 1D VQ token maps, it better preserves fine-grained +spatio-temporal details and promotes spatial awareness within each token. +Building on this representation, InterMask utilizes a generative masked +modeling framework to collaboratively model the tokens of two interacting +individuals. This is achieved by employing a transformer architecture +specifically designed to capture complex spatio-temporal inter-dependencies. +During training, it randomly masks the motion tokens of both individuals and +learns to predict them. For inference, starting from fully masked sequences, it +progressively fills in the tokens for both individuals. With its enhanced +motion representation, dedicated architecture, and effective learning strategy, +InterMask achieves state-of-the-art results, producing high-fidelity and +diverse human interactions. It outperforms previous methods, achieving an FID +of $5.154$ (vs $5.535$ of in2IN) on the InterHuman dataset and $0.399$ (vs +$5.207$ of InterGen) on the InterX dataset. Additionally, InterMask seamlessly +supports reaction generation without the need for model redesign or +fine-tuning. + +
+
+ comment: Project webpage: https://gohar-malik.github.io/intermask +
+
+
+
+
+ + ♻ ☆ MoCoLSK: Modality Conditioned High-Resolution Downscaling for Land + Surface Temperature + + +
+ Land Surface Temperature (LST) is a critical parameter for environmental +studies, but directly obtaining high spatial resolution LST data remains +challenging due to the spatio-temporal trade-off in satellite remote sensing. +Guided LST downscaling has emerged as an alternative solution to overcome these +limitations, but current methods often neglect spatial non-stationarity, and +there is a lack of an open-source ecosystem for deep learning methods. In this +paper, we propose the Modality-Conditional Large Selective Kernel (MoCoLSK) +Network, a novel architecture that dynamically fuses multi-modal data through +modality-conditioned projections. MoCoLSK achieves a confluence of dynamic +receptive field adjustment and multi-modal feature fusion, leading to enhanced +LST prediction accuracy. Furthermore, we establish the GrokLST project, a +comprehensive open-source ecosystem featuring the GrokLST dataset, a +high-resolution benchmark, and the GrokLST toolkit, an open-source +PyTorch-based toolkit encapsulating MoCoLSK alongside 40+ state-of-the-art +approaches. Extensive experimental results validate MoCoLSK's effectiveness in +capturing complex dependencies and subtle variations within multispectral data, +outperforming existing methods in LST downscaling. Our code, dataset, and +toolkit are available at https://github.com/GrokCV/GrokLST. + +
+
+ comment: Accepted by IEEE TGRS +
+
+
+
+
+ + ♻ ☆ TEASER: Token Enhanced Spatial Modeling for Expressions Reconstruction ICLR 2025 + + +
+ 3D facial reconstruction from a single in-the-wild image is a crucial task in +human-centered computer vision tasks. While existing methods can recover +accurate facial shapes, there remains significant space for improvement in +fine-grained expression capture. Current approaches struggle with irregular +mouth shapes, exaggerated expressions, and asymmetrical facial movements. We +present TEASER (Token EnhAnced Spatial modeling for Expressions +Reconstruction), which addresses these challenges and enhances 3D facial +geometry performance. TEASER tackles two main limitations of existing methods: +insufficient photometric loss for self-reconstruction and inaccurate +localization of subtle expressions. We introduce a multi-scale tokenizer to +extract facial appearance information. Combined with a neural renderer, these +tokens provide precise geometric guidance for expression reconstruction. +Furthermore, TEASER incorporates a pose-dependent landmark loss to further +improve geometric performances. Our approach not only significantly enhances +expression reconstruction quality but also offers interpretable tokens suitable +for various downstream applications, such as photorealistic facial video +driving, expression transfer, and identity swapping. Quantitative and +qualitative experimental results across multiple datasets demonstrate that +TEASER achieves state-of-the-art performance in precise expression +reconstruction. + +
+
+ comment: Accepted by ICLR 2025, code and demos are available at + https://tinyurl.com/TEASER-project +
+
+
+
+
+ + ♻ ☆ Improving vision-language alignment with graph spiking hybrid Networks + + +
+ To bridge the semantic gap between vision and language (VL), it is necessary +to develop a good alignment strategy, which includes handling semantic +diversity, abstract representation of visual information, and generalization +ability of models. Recent works use detector-based bounding boxes or patches +with regular partitions to represent visual semantics. While current paradigms +have made strides, they are still insufficient for fully capturing the nuanced +contextual relations among various objects. This paper proposes a comprehensive +visual semantic representation module, necessitating the utilization of +panoptic segmentation to generate coherent fine-grained semantic features. +Furthermore, we propose a novel Graph Spiking Hybrid Network (GSHN) that +integrates the complementary advantages of Spiking Neural Networks (SNNs) and +Graph Attention Networks (GATs) to encode visual semantic information. +Intriguingly, the model not only encodes the discrete and continuous latent +variables of instances but also adeptly captures both local and global +contextual features, thereby significantly enhancing the richness and diversity +of semantic representations. Leveraging the spatiotemporal properties inherent +in SNNs, we employ contrastive learning (CL) to enhance the similarity-based +representation of embeddings. This strategy alleviates the computational +overhead of the model and enriches meaningful visual representations by +constructing positive and negative sample pairs. We design an innovative +pre-training method, Spiked Text Learning (STL), which uses text features to +improve the encoding ability of discrete semantics. Experiments show that the +proposed GSHN exhibits promising results on multiple VL downstream tasks. + +
+
+
+
+
+ + ♻ ☆ Improving Long-Text Alignment for Text-to-Image Diffusion Models + + +
+ The rapid advancement of text-to-image (T2I) diffusion models has enabled +them to generate unprecedented results from given texts. However, as text +inputs become longer, existing encoding methods like CLIP face limitations, and +aligning the generated images with long texts becomes challenging. To tackle +these issues, we propose LongAlign, which includes a segment-level encoding +method for processing long texts and a decomposed preference optimization +method for effective alignment training. For segment-level encoding, long texts +are divided into multiple segments and processed separately. This method +overcomes the maximum input length limits of pretrained encoding models. For +preference optimization, we provide decomposed CLIP-based preference models to +fine-tune diffusion models. Specifically, to utilize CLIP-based preference +models for T2I alignment, we delve into their scoring mechanisms and find that +the preference scores can be decomposed into two components: a text-relevant +part that measures T2I alignment and a text-irrelevant part that assesses other +visual aspects of human preference. Additionally, we find that the +text-irrelevant part contributes to a common overfitting problem during +fine-tuning. To address this, we propose a reweighting strategy that assigns +different weights to these two components, thereby reducing overfitting and +enhancing alignment. After fine-tuning $512 \times 512$ Stable Diffusion (SD) +v1.5 for about 20 hours using our method, the fine-tuned SD outperforms +stronger foundation models in T2I alignment, such as PixArt-$\alpha$ and +Kandinsky v2.2. The code is available at +https://github.com/luping-liu/LongAlign. + +
+
+
+
+
+ + ♻ ☆ SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor + Diagnosis + + +
+ Brain tumors can lead to neurological dysfunction, cognitive and +psychological changes, increased intracranial pressure, and seizures, posing +significant risks to health. The You Only Look Once (YOLO) series has shown +superior accuracy in medical imaging object detection. This paper presents a +novel SCC-YOLO architecture that integrates the SCConv module into YOLOv9. The +SCConv module optimizes convolutional efficiency by reducing spatial and +channel redundancy, enhancing image feature learning. We examine the effects of +different attention mechanisms with YOLOv9 for brain tumor detection using the +Br35H dataset and our custom dataset (Brain_Tumor_Dataset). Results indicate +that SCC-YOLO improved mAP50 by 0.3% on the Br35H dataset and by 0.5% on our +custom dataset compared to YOLOv9. SCC-YOLO achieves state-of-the-art +performance in brain tumor detection. + +
+
+
+
+
+ + ♻ ☆ IRisPath: Enhancing Costmap for Off-Road Navigation with Robust IR-RGB + Fusion for Improved Day and Night Traversability + + +
+ Autonomous off-road navigation is required for applications in agriculture, +construction, search and rescue and defence. Traditional on-road autonomous +methods struggle with dynamic terrains, leading to poor vehicle control in +off-road conditions. Recent deep-learning models have used perception sensors +along with kinesthetic feedback for navigation on such terrains. However, this +approach has out-of-domain uncertainty. Factors like changes in time of day and +weather impact the performance of the model. We propose a multi modal fusion +network "IRisPath" capable of using Thermal and RGB images to provide +robustness against dynamic weather and light conditions. To aid further works +in this domain, we also open-source a day-night dataset with Thermal and RGB +images along with pseudo-labels for traversability. In order to co-register for +fusion model we also develop a novel method for targetless extrinsic +calibration of Thermal, LiDAR and RGB cameras with translation accuracy of ++/-1.7cm and rotation accuracy of +/-0.827degrees. + +
+
+
+
+
+ + ♻ ☆ End-to-End Augmentation Hyperparameter Tuning for Self-Supervised + Anomaly Detection + + +
+ Self-supervised learning (SSL) has emerged as a promising paradigm that +presents supervisory signals to real-world problems, bypassing the extensive +cost of manual labeling. Consequently, self-supervised anomaly detection (SSAD) +has seen a recent surge of interest, since SSL is especially attractive for +unsupervised tasks. However, recent works have reported that the choice of a +data augmentation function has significant impact on the accuracy of SSAD, +posing augmentation search as an essential but nontrivial problem with the lack +of labeled validation data. In this paper, we introduce ST-SSAD, the first +systematic approach for rigorous augmentation tuning on SSAD. To this end, our +work presents two key contributions. The first is a new unsupervised validation +loss that quantifies the alignment between augmented training data and +unlabeled validation data. The second is new differentiable augmentation +functions, allowing data augmentation hyperparameter(s) to be tuned in an +end-to-end manner. Experiments on two testbeds with semantic class anomalies +and subtle industrial defects show that ST-SSAD gives significant performance +gains over existing works. + +
+
+
+
+
+ + ♻ ☆ AuroraCap: Efficient, Performant Video Detailed Captioning and a New + Benchmark ICLR 2025 + + +
+ Video detailed captioning is a key task which aims to generate comprehensive +and coherent textual descriptions of video content, benefiting both video +understanding and generation. In this paper, we propose AuroraCap, a video +captioner based on a large multimodal model. We follow the simplest +architecture design without additional parameters for temporal modeling. To +address the overhead caused by lengthy video sequences, we implement the token +merging strategy, reducing the number of input visual tokens. Surprisingly, we +found that this strategy results in little performance loss. AuroraCap shows +superior performance on various video and image captioning benchmarks, for +example, obtaining a CIDEr of 88.9 on Flickr30k, beating GPT-4V (55.3) and +Gemini-1.5 Pro (82.2). However, existing video caption benchmarks only include +simple descriptions, consisting of a few dozen words, which limits research in +this field. Therefore, we develop VDC, a video detailed captioning benchmark +with over one thousand carefully annotated structured captions. In addition, we +propose a new LLM-assisted metric VDCscore for bettering evaluation, which +adopts a divide-and-conquer strategy to transform long caption evaluation into +multiple short question-answer pairs. With the help of human Elo ranking, our +experiments show that this benchmark better correlates with human judgments of +video detailed captioning quality. + +
+
+ comment: Accepted to ICLR 2025. Code, docs, weight, benchmark and training + data are all available at https://rese1f.github.io/aurora-web/ +
+
+
+
+
+ + ♻ ☆ EMT: A Visual Multi-Task Benchmark Dataset for Autonomous Driving in the + Arab Gulf Region + + +
+ This paper introduces the Emirates Multi-Task (EMT) dataset - the first +publicly available dataset for autonomous driving collected in the Arab Gulf +region. The EMT dataset captures the unique road topology, high traffic +congestion, and distinctive characteristics of the Gulf region, including +variations in pedestrian clothing and weather conditions. It contains over +30,000 frames from a dash-camera perspective, along with 570,000 annotated +bounding boxes, covering approximately 150 kilometers of driving routes. The +EMT dataset supports three primary tasks: tracking, trajectory forecasting and +intention prediction. Each benchmark dataset is complemented with corresponding +evaluations: (1) multi-agent tracking experiments, focusing on multi-class +scenarios and occlusion handling; (2) trajectory forecasting evaluation using +deep sequential and interaction-aware models; and (3) intention benchmark +experiments conducted for predicting agents intentions from observed +trajectories. The dataset is publicly available at avlab.io/emt-dataset, and +pre-processing scripts along with evaluation models can be accessed at +github.com/AV-Lab/emt-dataset. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Measuring Anxiety Levels with Head Motion Patterns in Severe Depression + Population + + +
+ Depression and anxiety are prevalent mental health disorders that frequently +cooccur, with anxiety significantly influencing both the manifestation and +treatment of depression. An accurate assessment of anxiety levels in +individuals with depression is crucial to develop effective and personalized +treatment plans. This study proposes a new noninvasive method for quantifying +anxiety severity by analyzing head movements -- specifically speed, +acceleration, and angular displacement -- during video-recorded interviews with +patients suffering from severe depression. Using data from a new CALYPSO +Depression Dataset, we extracted head motion characteristics and applied +regression analysis to predict clinically evaluated anxiety levels. Our results +demonstrate a high level of precision, achieving a mean absolute error (MAE) of +0.35 in predicting the severity of psychological anxiety based on head movement +patterns. This indicates that our approach can enhance the understanding of +anxiety's role in depression and assist psychiatrists in refining treatment +strategies for individuals. + +
+
+ comment: 19th IEEE International Conference on Automatic Face and Gesture + Recognition (FG), 2025 +
+
+
+
+
+ + ♻ ☆ Few-Class Arena: A Benchmark for Efficient Selection of Vision Models + and Dataset Difficulty Measurement + + +
+ We propose Few-Class Arena (FCA), as a unified benchmark with focus on +testing efficient image classification models for few classes. A wide variety +of benchmark datasets with many classes (80-1000) have been created to assist +Computer Vision architectural evolution. An increasing number of vision models +are evaluated with these many-class datasets. However, real-world applications +often involve substantially fewer classes of interest (2-10). This gap between +many and few classes makes it difficult to predict performance of the few-class +applications using models trained on the available many-class datasets. To +date, little has been offered to evaluate models in this Few-Class Regime. We +conduct a systematic evaluation of the ResNet family trained on ImageNet +subsets from 2 to 1000 classes, and test a wide spectrum of Convolutional +Neural Networks and Transformer architectures over ten datasets by using our +newly proposed FCA tool. Furthermore, to aid an up-front assessment of dataset +difficulty and a more efficient selection of models, we incorporate a +difficulty measure as a function of class similarity. FCA offers a new tool for +efficient machine learning in the Few-Class Regime, with goals ranging from a +new efficient class similarity proposal, to lightweight model architecture +design, to a new scaling law. FCA is user-friendly and can be easily extended +to new models and datasets, facilitating future research work. Our benchmark is +available at https://github.com/bryanbocao/fca. + +
+
+ comment: 10 pages, 32 pages including References and Appendix, 19 figures, 8 + tables +
+
+
+
+
+ + ♻ ☆ Modeling and Analysis of Spatial and Temporal Land Clutter Statistics in + SAR Imaging Based on MSTAR Data + + +
+ The statistical analysis of land clutter for Synthetic Aperture Radar (SAR) +imaging has become an increasingly important subject for research and +investigation. It is also absolutely necessary for designing robust algorithms +capable of performing the task of target detection in the background clutter. +Any attempt to extract the energy of the desired targets from the land clutter +requires complete knowledge of the statistical properties of the background +clutter. In this paper, the spatial as well as the temporal characteristics of +the land clutter are studied. Since the data for each image has been collected +based on a different aspect angle; therefore, the temporal analysis contains +variation in the aspect angle. Consequently, the temporal analysis includes the +characteristics of the radar cross section with respect to the aspect angle +based on which the data has been collected. In order to perform the statistical +analysis, several well-known and relevant distributions, namely, Weibull, +Log-normal, Gamma, and Rayleigh are considered as prime candidates to model the +land clutter. The goodness-of-fit test is based on the Kullback-Leibler (KL) +Divergence metric. The detailed analysis presented in this paper demonstrates +that the Weibull distribution is a more accurate fit for the +temporal-aspect-angle statistical analysis while the Rayleigh distribution +models the spatial characteristics of the background clutter with higher +accuracy. Finally, based on the aforementioned statistical analyses and by +utilizing the Constant False Alarm Rate (CFAR) algorithm, we perform target +detection in land clutter. The overall verification of the analysis is +performed by exploiting the Moving and Stationary Target Acquisition and +Recognition (MSTAR) data-set, which has been collected in spotlight mode at +X-band, and the results are presented. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2409.02155 +
+
+
+
+
+ + ♻ ☆ Snuffy: Efficient Whole Slide Image Classifier ECCV 2024 + + +
+ Whole Slide Image (WSI) classification with multiple instance learning (MIL) +in digital pathology faces significant computational challenges. Current +methods mostly rely on extensive self-supervised learning (SSL) for +satisfactory performance, requiring long training periods and considerable +computational resources. At the same time, no pre-training affects performance +due to domain shifts from natural images to WSIs. We introduce Snuffy +architecture, a novel MIL-pooling method based on sparse transformers that +mitigates performance loss with limited pre-training and enables continual +few-shot pre-training as a competitive option. Our sparsity pattern is tailored +for pathology and is theoretically proven to be a universal approximator with +the tightest probabilistic sharp bound on the number of layers for sparse +transformers, to date. We demonstrate Snuffy's effectiveness on CAMELYON16 and +TCGA Lung cancer datasets, achieving superior WSI and patch-level accuracies. +The code is available on https://github.com/jafarinia/snuffy. + +
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ♻ ☆ DynRefer: Delving into Region-level Multimodal Tasks via Dynamic + Resolution CVPR 2025 + + +
+ One fundamental task of multimodal models is to translate referred image +regions to human preferred language descriptions. Existing methods, however, +ignore the resolution adaptability needs of different tasks, which hinders them +from finding precise language descriptions. In this study, we propose a DynRefer +approach, to pursue high-accuracy region-level referring through mimicking the +resolution adaptability of human visual cognition. During training, DynRefer +stochastically aligns language descriptions of multimodal tasks with images of +multiple resolutions, which are constructed by nesting a set of random views +around the referred region. During inference, DynRefer selectively performs +multimodal referring by sampling proper region representations for tasks from +the nested views based on image and task priors. This allows the visual +information for referring to better match human preferences, thereby improving +the representational adaptability of region-level multimodal models. +Experiments show that DynRefer brings mutual improvement upon broad tasks +including region-level captioning, open-vocabulary region recognition and +attribute detection. Furthermore, DynRefer achieves state-of-the-art results on +multiple region-level multimodal tasks using a single model. Code is available +at https://github.com/callsys/DynRefer. + +
+
+ comment: Accepted in CVPR 2025. Code is available at + https://github.com/callsys/DynRefer +
+
+
+
+
+ + ♻ ☆ Neural Finite-State Machines for Surgical Phase Recognition + + +
+ Surgical phase recognition (SPR) is crucial for applications in workflow +optimization, performance evaluation, and real-time intervention guidance. +However, current deep learning models often struggle with fragmented +predictions, failing to capture the sequential nature of surgical workflows. We +propose the Neural Finite-State Machine (NFSM), a novel approach that enforces +temporal coherence by integrating classical state-transition priors with modern +neural networks. NFSM leverages learnable global state embeddings as unique +phase identifiers and dynamic transition tables to model phase-to-phase +progressions. Additionally, a future phase forecasting mechanism employs +repeated frame padding to anticipate upcoming transitions. Implemented as a +plug-and-play module, NFSM can be integrated into existing SPR pipelines +without changing their core architectures. We demonstrate state-of-the-art +performance across multiple benchmarks, including a significant improvement on +the BernBypass70 dataset - raising video-level accuracy by 0.9 points and +phase-level precision, recall, F1-score, and mAP by 3.8, 3.1, 3.3, and 4.1, +respectively. Ablation studies confirm each component's effectiveness and the +module's adaptability to various architectures. By unifying finite-state +principles with deep learning, NFSM offers a robust path toward consistent, +long-term surgical video analysis. + +
+
+
+
+
+ + ♻ ☆ Refinement Module based on Parse Graph of Feature Map for Human Pose + Estimation + + +
+ Parse graphs of the human body can be obtained in the human brain to help +humans better complete Human Pose Estimation (HPE). It contains a +hierarchical structure, like a tree structure, and context relations among +nodes. To equip models with such capabilities, many researchers predefine the +parse graph of body structure to design HPE frameworks. However, these +frameworks struggle to adapt to instances that deviate from the predefined +parse graph and are often parameter-heavy. Unlike them, we view the feature map +holistically, much like the human body. It can be optimized using parse graphs, +where each node's feature is an implicit expression rather than a fixed one. +This allows it to adapt to more instances, unconstrained by rigid structural +features. In this paper, we design the Refinement Module based on the Parse +Graph of feature map (RMPG), which includes two stages: top-down decomposition +and bottom-up combination. In the first stage, the feature map is decomposed +into multiple sub-feature maps along the channel. In the second stage, the +context relations of sub-feature maps are calculated to obtain their respective +context information and the sub-feature maps with context information are +concatenated along channels to obtain the refined feature map. Additionally, we +design a hierarchical network with fewer parameters using multiple RMPG modules +to model the context relations and hierarchies in the parse graph of body +structure for HPE, some of which are supervised to obtain context relations +among body parts. Our network achieves excellent results on multiple mainstream +human pose datasets. More importantly, the effectiveness of RMPG is proven on +different methods. The code of RMPG will be open. + +
+
+
+
+
+ + ♻ ☆ Towards Robust Algorithms for Surgical Phase Recognition via Digital + Twin Representation + + +
+ Surgical phase recognition (SPR) is an integral component of surgical data +science, enabling high-level surgical analysis. End-to-end trained neural +networks that predict surgical phase directly from videos have shown excellent +performance on benchmarks. However, these models struggle with robustness due +to non-causal associations in the training set. Our goal is to improve model +robustness to variations in the surgical videos by leveraging the digital twin +(DT) paradigm -- an intermediary layer to separate high-level analysis (SPR) +from low-level processing. As a proof of concept, we present a DT +representation-based framework for SPR from videos. The framework employs +vision foundation models with reliable low-level scene understanding to craft +DT representation. We embed the DT representation in place of raw video inputs +in the state-of-the-art SPR model. The framework is trained on the Cholec80 +dataset and evaluated on out-of-distribution (OOD) and corrupted test samples. +Contrary to the vulnerability of the baseline model, our framework demonstrates +strong robustness on both OOD and corrupted samples, with a video-level +accuracy of 80.3 on a highly corrupted Cholec80 test set, 67.9 on the +challenging CRCD dataset, and 99.8 on an internal robotic surgery dataset, +outperforming the baseline by 3.9, 16.8, and 90.9 respectively. We also find +that using DT representation as an augmentation to the raw input can +significantly improve model robustness. Our findings lend support to the thesis +that DT representations are effective in enhancing model robustness. Future +work will seek to improve the feature informativeness and incorporate +interpretability for a more comprehensive framework. + +
+
+
+
+
+ + ♻ ☆ SoK: Systematization and Benchmarking of Deepfake Detectors in a Unified + Framework EuroS&P '25 + + +
+ Deepfakes have rapidly emerged as a serious threat to society due to their +ease of creation and dissemination, triggering the accelerated development of +detection technologies. However, many existing detectors rely on lab-generated +datasets for validation, which may not prepare them for novel, real-world +deepfakes. This paper extensively reviews and analyzes state-of-the-art +deepfake detectors, evaluating them against several critical criteria. These +criteria categorize detectors into 4 high-level groups and 13 fine-grained +sub-groups, aligned with a unified conceptual framework we propose. This +classification offers practical insights into the factors affecting detector +efficacy. We evaluate the generalizability of 16 leading detectors across +comprehensive attack scenarios, including black-box, white-box, and gray-box +settings. Our systematized analysis and experiments provide a deeper +understanding of deepfake detectors and their generalizability, paving the way +for future research and the development of more proactive defenses against +deepfakes. + +
+
+ comment: 20 pages, 6 figures, 7 tables, Accepted at IEEE European Symposium on + security and privacy 2025 (EuroS&P '25) +
+
+
+
+
+ + ♻ ☆ Image Watermarks are Removable Using Controllable Regeneration from + Clean Noise ICLR2025 + + +
+ Image watermark techniques provide an effective way to assert ownership, +deter misuse, and trace content sources, which has become increasingly +essential in the era of large generative models. A critical attribute of +watermark techniques is their robustness against various manipulations. In this +paper, we introduce a watermark removal approach capable of effectively +nullifying state-of-the-art watermarking techniques. Our primary insight +involves regenerating the watermarked image starting from a clean Gaussian +noise via a controllable diffusion model, utilizing the extracted semantic and +spatial features from the watermarked image. The semantic control adapter and +the spatial control network are specifically trained to control the denoising +process towards ensuring image quality and enhancing consistency between the +cleaned image and the original watermarked image. To achieve a smooth trade-off +between watermark removal performance and image consistency, we further propose +an adjustable and controllable regeneration scheme. This scheme adds varying +numbers of noise steps to the latent representation of the watermarked image, +followed by a controlled denoising process starting from this noisy latent +representation. As the number of noise steps increases, the latent +representation progressively approaches clean Gaussian noise, facilitating the +desired trade-off. We apply our watermark removal methods across various +watermarking techniques, and the results demonstrate that our methods offer +superior visual consistency/quality and enhanced watermark removal performance +compared to existing regeneration approaches. Our code is available at +https://github.com/yepengliu/CtrlRegen. + +
+
+ comment: ICLR2025 +
+
+
+
+
+ + ♻ ☆ Adaptive Neural Networks for Intelligent Data-Driven Development + + +
+ Advances in machine learning methods for computer vision tasks have led to +their consideration for safety-critical applications like autonomous driving. +However, effectively integrating these methods into the automotive development +lifecycle remains challenging. Since the performance of machine learning +algorithms relies heavily on the training data provided, the data and model +development lifecycle play a key role in successfully integrating these +components into the product development lifecycle. Existing models frequently +encounter difficulties recognizing or adapting to novel instances not present +in the original training dataset. This poses a significant risk for reliable +deployment in dynamic environments. To address this challenge, we propose an +adaptive neural network architecture and an iterative development framework +that enables users to efficiently incorporate previously unknown objects into +the current perception system. Our approach builds on continuous learning, +emphasizing the necessity of dynamic updates to reflect real-world deployment +conditions. Specifically, we introduce a pipeline with three key components: +(1) a scalable network extension strategy to integrate new classes while +preserving existing performance, (2) a dynamic OoD detection component that +requires no additional retraining for newly added classes, and (3) a +retrieval-based data augmentation process tailored for safety-critical +deployments. The integration of these components establishes a pragmatic and +adaptive pipeline for the continuous evolution of perception systems in the +context of autonomous driving. + +
+
+ comment: 8 pages, 3 figures, and 3 tables +
+
+
+
+
+ + ♻ ☆ Segment-Level Road Obstacle Detection Using Visual Foundation Model + Priors and Likelihood Ratios + + +
+ Detecting road obstacles is essential for autonomous vehicles to navigate +dynamic and complex traffic environments safely. Current road obstacle +detection methods typically assign a score to each pixel and apply a threshold +to generate final predictions. However, selecting an appropriate threshold is +challenging, and the per-pixel classification approach often leads to +fragmented predictions with numerous false positives. In this work, we propose +a novel method that leverages segment-level features from visual foundation +models and likelihood ratios to predict road obstacles directly. By focusing on +segments rather than individual pixels, our approach enhances detection +accuracy, reduces false positives, and offers increased robustness to scene +variability. We benchmark our approach against existing methods on the +RoadObstacle and LostAndFound datasets, achieving state-of-the-art performance +without needing a predefined threshold. + +
+
+ comment: 10 pages, 4 figures, and 1 table, to be published in VISAPP 2025 +
+
+
+
+
+ + ♻ ☆ VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality + Documents + + +
+ Retrieval-augmented generation (RAG) is an effective technique that enables +large language models (LLMs) to utilize external knowledge sources for +generation. However, current RAG systems are solely based on text, rendering it +impossible to utilize vision information like layout and images that play +crucial roles in real-world multi-modality documents. In this paper, we +introduce VisRAG, which tackles this issue by establishing a vision-language +model (VLM)-based RAG pipeline. In this pipeline, instead of first parsing the +document to obtain text, the document is directly embedded using a VLM as an +image and then retrieved to enhance the generation of a VLM. Compared to +traditional text-based RAG, VisRAG maximizes the retention and utilization of +the data information in the original documents, eliminating the information +loss introduced during the parsing process. We collect both open-source and +synthetic data to train the retriever in VisRAG and explore a variety of +generation methods. Experiments demonstrate that VisRAG outperforms traditional +RAG in both the retrieval and generation stages, achieving a 20--40% end-to-end +performance gain over traditional text-based RAG pipeline. Further analysis +reveals that VisRAG is efficient in utilizing training data and demonstrates +strong generalization capability, positioning it as a promising solution for +RAG on multi-modality documents. Our code and data are available at +https://github.com/openbmb/visrag. + +
+
+
+
+
+ + ♻ ☆ Score Forgetting Distillation: A Swift, Data-Free Method for Machine + Unlearning in Diffusion Models ICLR 2025 + + +
+ The machine learning community is increasingly recognizing the importance of +fostering trust and safety in modern generative AI (GenAI) models. We posit +machine unlearning (MU) as a crucial foundation for developing safe, secure, +and trustworthy GenAI models. Traditional MU methods often rely on stringent +assumptions and require access to real data. This paper introduces Score +Forgetting Distillation (SFD), an innovative MU approach that promotes the +forgetting of undesirable information in diffusion models by aligning the +conditional scores of "unsafe" classes or concepts with those of "safe" ones. +To eliminate the need for real data, our SFD framework incorporates a +score-based MU loss into the score distillation objective of a pretrained +diffusion model. This serves as a regularization term that preserves desired +generation capabilities while enabling the production of synthetic data through +a one-step generator. Our experiments on pretrained label-conditional and +text-to-image diffusion models demonstrate that our method effectively +accelerates the forgetting of target classes or concepts during generation, +while preserving the quality of other classes or concepts. This unlearned and +distilled diffusion not only pioneers a novel concept in MU but also +accelerates the generation speed of diffusion models. Our experiments and +studies on a range of diffusion models and datasets confirm that our approach +is generalizable, effective, and advantageous for MU in diffusion models. Code +is available at https://github.com/tqch/score-forgetting-distillation. +($\textbf{Warning:}$ This paper contains sexually explicit imagery, discussions +of pornography, racially-charged terminology, and other content that some +readers may find disturbing, distressing, and/or offensive.) + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ GP-GS: Gaussian Processes for Enhanced Gaussian Splatting + + +
+ 3D Gaussian Splatting has emerged as an efficient photorealistic novel view +synthesis method. However, its reliance on sparse Structure-from-Motion (SfM) +point clouds consistently compromises the scene reconstruction quality. To +address these limitations, this paper proposes a novel 3D reconstruction +framework Gaussian Processes Gaussian Splatting (GP-GS), where a multi-output +Gaussian Process model is developed to achieve adaptive and uncertainty-guided +densification of sparse SfM point clouds. Specifically, we propose a dynamic +sampling and filtering pipeline that adaptively expands the SfM point clouds by +leveraging GP-based predictions to infer new candidate points from the input 2D +pixels and depth maps. The pipeline utilizes uncertainty estimates to guide the +pruning of high-variance predictions, ensuring geometric consistency and +enabling the generation of dense point clouds. The densified point clouds +provide high-quality initial 3D Gaussians to enhance reconstruction +performance. Extensive experiments conducted on synthetic and real-world +datasets across various scales validate the effectiveness and practicality of +the proposed framework. + +
+
+ comment: 14 pages,11 figures +
+
+
+
+
+ + ♻ ☆ All Seeds Are Not Equal: Enhancing Compositional Text-to-Image + Generation with Reliable Random Seeds + + +
+ Text-to-image diffusion models have demonstrated remarkable capability in +generating realistic images from arbitrary text prompts. However, they often +produce inconsistent results for compositional prompts such as "two dogs" or "a +penguin on the right of a bowl". Understanding these inconsistencies is crucial +for reliable image generation. In this paper, we highlight the significant role +of initial noise in these inconsistencies, where certain noise patterns are +more reliable for compositional prompts than others. Our analyses reveal that +different initial random seeds tend to guide the model to place objects in +distinct image areas, potentially adhering to specific patterns of camera +angles and image composition associated with the seed. To improve the model's +compositional ability, we propose a method for mining these reliable cases, +resulting in a curated training set of generated images without requiring any +manual annotation. By fine-tuning text-to-image models on these generated +images, we significantly enhance their compositional capabilities. For +numerical composition, we observe relative increases of 29.3% and 19.5% for +Stable Diffusion and PixArt-{\alpha}, respectively. Spatial composition sees +even larger gains, with 60.7% for Stable Diffusion and 21.1% for +PixArt-{\alpha}. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 78 + +
+
+
+ + ♻ ☆ Automatically Improving LLM-based Verilog Generation using EDA Tool + Feedback + + +
+ Traditionally, digital hardware designs are written in the Verilog hardware +description language (HDL) and debugged manually by engineers. This can be +time-consuming and error-prone for complex designs. Large Language Models +(LLMs) are emerging as a potential tool to help generate fully functioning HDL +code, but most works have focused on generation in the single-shot capacity: +i.e., run and evaluate, a process that does not leverage debugging and, as +such, does not adequately reflect a realistic development process. In this +work, we evaluate the ability of LLMs to leverage feedback from electronic +design automation (EDA) tools to fix mistakes in their own generated Verilog. +To accomplish this, we present an open-source, highly customizable framework, +AutoChip, which combines conversational LLMs with the output from Verilog +compilers and simulations to iteratively generate and repair Verilog. To +determine the success of these LLMs we leverage the VerilogEval benchmark set. +We evaluate four state-of-the-art conversational LLMs, focusing on readily +accessible commercial models. EDA tool feedback proved to be consistently more +effective than zero-shot prompting only with GPT-4o, the most computationally +complex model we evaluated. In the best case, we observed a 5.8% increase in +the number of successful designs with a 34.2% decrease in cost over the best +zero-shot results. Mixing smaller models with this larger model at the end of +the feedback iterations resulted in equally as much success as with GPT-4o +using feedback, but incurred 41.9% lower cost (corresponding to an overall +decrease in cost over zero-shot by 89.6%). + +
+
+
+
+
+ + ♻ ☆ Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of + Encoders + + +
+ The ability to accurately interpret complex visual information is a crucial +topic of multimodal large language models (MLLMs). Recent work indicates that +enhanced visual perception significantly reduces hallucinations and improves +performance on resolution-sensitive tasks, such as optical character +recognition and document analysis. A number of recent MLLMs achieve this goal +using a mixture of vision encoders. Despite their success, there is a lack of +systematic comparisons and detailed ablation studies addressing critical +aspects, such as expert selection and the integration of multiple vision +experts. This study provides an extensive exploration of the design space for +MLLMs using a mixture of vision encoders and resolutions. Our findings reveal +several underlying principles common to various existing strategies, leading to +a streamlined yet effective design approach. We discover that simply +concatenating visual tokens from a set of complementary vision encoders is as +effective as more complex mixing architectures or strategies. We additionally +introduce Pre-Alignment to bridge the gap between vision-focused encoders and +language tokens, enhancing model coherence. The resulting family of MLLMs, +Eagle, surpasses other leading open-source models on major MLLM benchmarks. + +
+
+ comment: Github: https://github.com/NVlabs/Eagle, HuggingFace: + https://huggingface.co/NVEagle +
+
+
+
+
+ + ♻ ☆ ActionReasoningBench: Reasoning about Actions with and without + Ramification Constraints ICLR 2025 + + +
+ Reasoning about Actions and Change (RAC) has historically played a pivotal +role in solving foundational AI problems, such as the frame problem. It has +driven advancements in AI fields, such as non-monotonic and commonsense +reasoning. RAC remains crucial for AI systems that operate in dynamic +environments, engage in interactive scenarios, or rely on commonsense +reasoning. Despite substantial advances made by Large Language Models (LLMs) in +various AI domains, their performance in RAC remains underexplored. To address +this gap, we introduce a new diagnostic benchmark, ActionReasoningBench, which +encompasses 8 domains and includes questions for up to 19 action sequences. +This benchmark rigorously evaluates LLMs across six key RAC dimensions: Fluent +Tracking, State Tracking, Action Executability, Effects of Actions, Numerical +RAC, and Composite Questions. LLMs demonstrate average accuracy rates of +73.55%, 65.63%, 58.73%, and 62.38% on the former four dimensions, which are +frequently discussed in RAC literature. However, on the latter +two dimensions, which introduce complex and novel reasoning questions, the +average performance of LLMs is lowered to 33.16% and 51.19%, respectively, +reflecting a 17.9% performance decline. We also introduce new ramification +constraints to capture the indirect effects of actions, providing deeper +insights into RAC challenges. Our evaluation of state-of-the-art LLMs, +including both open-source and commercial models, reveals challenges across all +RAC dimensions, particularly in handling ramifications, with GPT-4o failing to +solve any question and o1-preview achieving a score of only 18.4%. + +
+
+ comment: Accepted in ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Disentangling Representations through Multi-task Learning + + +
+ Intelligent perception and interaction with the world hinges on internal +representations that capture its underlying structure (''disentangled'' or +''abstract'' representations). Disentangled representations serve as world +models, isolating latent factors of variation in the world along approximately +orthogonal directions, thus facilitating feature-based generalization. We +provide experimental and theoretical results guaranteeing the emergence of +disentangled representations in agents that optimally solve multi-task evidence +accumulation classification tasks, canonical in the neuroscience literature. +The key conceptual finding is that, by producing accurate multi-task +classification estimates, a system implicitly represents a set of coordinates +specifying a disentangled representation of the underlying latent state of the +data it receives. The theory provides conditions for the emergence of these +representations in terms of noise, number of tasks, and evidence accumulation +time. We experimentally validate these predictions in RNNs trained to +multi-task, which learn disentangled representations in the form of continuous +attractors, leading to zero-shot out-of-distribution (OOD) generalization in +predicting latent factors. We demonstrate the robustness of our framework +across autoregressive architectures, decision boundary geometries and in tasks +requiring classification confidence estimation. We find that transformers are +particularly suited for disentangling representations, which might explain +their unique world understanding abilities. Overall, our framework establishes +a formal link between competence at multiple tasks and the formation of +disentangled, interpretable world models in both biological and artificial +systems, and helps explain why ANNs often arrive at human-interpretable +concepts, and how they both may acquire exceptional zero-shot generalization +capabilities. + +
+
+ comment: 43 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ We Have a Package for You! A Comprehensive Analysis of Package + Hallucinations by Code Generating LLMs USENIX Security + + +
+ The reliance of popular programming languages such as Python and JavaScript +on centralized package repositories and open-source software, combined with the +emergence of code-generating Large Language Models (LLMs), has created a new +type of threat to the software supply chain: package hallucinations. These +hallucinations, which arise from fact-conflicting errors when generating code +using LLMs, represent a novel form of package confusion attack that poses a +critical threat to the integrity of the software supply chain. This paper +conducts a rigorous and comprehensive evaluation of package hallucinations +across different programming languages, settings, and parameters, exploring how +a diverse set of models and configurations affect the likelihood of generating +erroneous package recommendations and identifying the root causes of this +phenomenon. Using 16 popular LLMs for code generation and two unique prompt +datasets, we generate 576,000 code samples in two programming languages that we +analyze for package hallucinations. Our findings reveal that the average +percentage of hallucinated packages is at least 5.2% for commercial models and +21.7% for open-source models, including a staggering 205,474 unique examples of +hallucinated package names, further underscoring the severity and pervasiveness +of this threat. To overcome this problem, we implement several hallucination +mitigation strategies and show that they are able to significantly reduce the +number of package hallucinations while maintaining code quality. Our +experiments and findings highlight package hallucinations as a persistent and +systemic phenomenon while using state-of-the-art LLMs for code generation, and +a significant challenge which deserves the research community's urgent +attention. + +
+
+ comment: To appear in the 2025 USENIX Security Symposium. 22 pages, 14 + figures, 8 tables. Edited from original version for submission to a different + conference. No change to original results or findings +
+
+
+
+
+ + ♻ ☆ C-Causal Blindness + + +
+ This text is concerned with a hypothetical flavour of cognitive blindness +referred to in this paper as \textit{C-Causal Blindness} or C-CB. A cognitive +blindness where the policy to obtain the objective leads to the state to be +avoided. A literal example of C-CB would be \textit{Kurt G\"odel's} decision to +starve for \textit{"fear of being poisoned"} - take this to be premise +\textbf{A}. The objective being \textit{"to avoid being poisoned (so as to not +die)"}: \textbf{C}, the plan or policy being \textit{"don't eat"}: \textbf{B}, +and the actual outcome having been \textit{"dying"}: $\lnot$\textbf{C} - the +state that G\"odel wanted to avoid to begin with. G\"odel pursued a strategy +that caused the result he wanted to avoid. An experimental computational +framework is proposed to show the isomorphic relationship between C-CB in brain +computations, logic, and computer computations using a new proposed algorithm: +a Weighted Hidden Markov Model. + +
+
+ comment: restructuring +
+
+
+
+
+ + ♻ ☆ Range, not Independence, Drives Modularity in Biologically Inspired + Representations + + +
+ Why do biological and artificial neurons sometimes modularise, each encoding +a single meaningful variable, and sometimes entangle their representation of +many variables? In this work, we develop a theory of when biologically inspired +networks -- those that are nonnegative and energy efficient -- modularise their +representation of source variables (sources). We derive necessary and +sufficient conditions on a sample of sources that determine whether the neurons +in an optimal biologically-inspired linear autoencoder modularise. Our theory +applies to any dataset, extending far beyond the case of statistical +independence studied in previous work. Rather we show that sources modularise +if their support is ``sufficiently spread''. From this theory, we extract and +validate predictions in a variety of empirical studies on how data distribution +affects modularisation in nonlinear feedforward and recurrent neural networks +trained on supervised and unsupervised tasks. Furthermore, we apply these ideas +to neuroscience data, showing that range independence can be used to understand +the mixing or modularising of spatial and reward information in entorhinal +recordings in seemingly conflicting experiments. Further, we use these results +to suggest alternate origins of mixed-selectivity, beyond the predominant +theory of flexible nonlinear classification. In sum, our theory prescribes +precise conditions on when neural activities modularise, providing tools for +inducing and elucidating modular representations in brains and machines. + +
+
+ comment: 47 pages, 17 figures. WD and KH contributed equally; LH and JHL + contributed equally +
+
+
+
+
+ + ♻ ☆ Inference to the Best Explanation in Large Language Models + + +
+ While Large Language Models (LLMs) have found success in real-world +applications, their underlying explanatory process is still poorly understood. +This paper proposes IBE-Eval, a framework inspired by philosophical accounts on +Inference to the Best Explanation (IBE) to advance the interpretation and +evaluation of LLMs' explanations. IBE-Eval estimates the plausibility of +natural language explanations through a combination of explicit logical and +linguistic features including: consistency, parsimony, coherence, and +uncertainty. Extensive experiments are conducted on Causal Question Answering +(CQA), where \textit{IBE-Eval} is tasked to select the most plausible causal +explanation amongst competing ones generated by LLMs (i.e., GPT 3.5 and Llama +2). The experiments reveal that IBE-Eval can successfully identify the best +explanation with up to 77\% accuracy ($\approx 27\%$ above random), improving +upon a GPT 3.5-as-a-Judge baseline ($\approx+17\%$) while being intrinsically +more efficient and interpretable. Additional analyses suggest that, despite +model-specific variances, LLM-generated explanations tend to conform to IBE +criteria and that IBE-Eval is significantly correlated with human judgment, +opening up opportunities for future development of automated explanation +verification tools. + +
+
+
+
+
+ + ♻ ☆ Lean Copilot: Large Language Models as Copilots for Theorem Proving in + Lean + + +
+ Neural theorem proving combines large language models (LLMs) with proof +assistants such as Lean, where the correctness of formal proofs can be +rigorously verified, leaving no room for hallucination. With existing neural +theorem provers pretrained on a fixed collection of data and offering valuable +suggestions at times, it is challenging for them to continually prove novel +theorems in a fully autonomous mode, where human insights may be critical. In +this paper, we explore LLMs as copilots that assist humans in proving theorems. +We introduce Lean Copilot, a general framework for running LLM inference +natively in Lean. It enables programmers to build various LLM-based proof +automation tools that integrate seamlessly into the workflow of Lean users. +Lean users can use our pretrained models or bring their own ones that run +either locally (with or without GPUs) or on the cloud. Using Lean Copilot, we +build LLM-based tools that suggest proof steps, complete proof goals, and +select relevant premises. Experimental results on the Mathematics in Lean +textbook demonstrate the effectiveness of our method compared to existing +rule-based proof automation in Lean (aesop). When assisting humans, Lean +Copilot requires only 2.08 manually-entered proof steps on average (3.86 +required by aesop); when automating the theorem proving process, Lean Copilot +automates 74.2% proof steps on average, 85% better than aesop (40.1%). We open +source all code and artifacts under a permissive MIT license to facilitate +further research. + +
+
+ comment: All code and artifacts open-sourced at + https://github.com/lean-dojo/LeanCopilot +
+
+
+
+
+ + ♻ ☆ SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and + Iterative Refinement + + +
+ Software engineers operating in complex and dynamic environments must +continuously adapt to evolving requirements, learn iteratively from experience, +and reconsider their approaches based on new insights. However, current large +language model (LLM)-based software agents often follow linear, sequential +processes that prevent backtracking and exploration of alternative solutions, +limiting their ability to rethink their strategies when initial approaches +prove ineffective. To address these challenges, we propose SWE-Search, a +multi-agent framework that integrates Monte Carlo Tree Search (MCTS) with a +self-improvement mechanism to enhance software agents' performance on +repository-level software tasks. SWE-Search extends traditional MCTS by +incorporating a hybrid value function that leverages LLMs for both numerical +value estimation and qualitative evaluation. This enables self-feedback loops +where agents iteratively refine their strategies based on both quantitative +numerical evaluations and qualitative natural language assessments of pursued +trajectories. The framework includes a SWE-Agent for adaptive exploration, a +Value Agent for iterative feedback, and a Discriminator Agent that facilitates +multi-agent debate for collaborative decision-making. Applied to the SWE-bench +benchmark, our approach demonstrates a 23% relative improvement in performance +across five models compared to standard open-source agents without MCTS. Our +analysis reveals how performance scales with increased inference-time compute +through deeper search, providing a pathway to improve software agents without +requiring larger models or additional training data. This highlights the +potential of self-evaluation driven search techniques in complex software +engineering environments. + +
+
+ comment: Main body: 10 pages, 5 figures. Appendix: 5 pages, 4 figures. + Open-source codebase +
+
+
+
+
+ + ♻ ☆ Distributed Speculative Inference (DSI): Speculation Parallelism for + Provably Faster Lossless Language Model Inference ICLR 2025 + + +
+ This paper introduces distributed speculative inference (DSI), a novel +inference algorithm that is provably faster than speculative inference (SI) +[leviathan2023, chen2023, miao2024, sun2025, timor2025] and standard +autoregressive inference (non-SI). Like other SI algorithms, DSI operates on +frozen language models (LMs), requiring no training or architectural +modifications, and it preserves the target distribution. Prior studies on SI +have demonstrated empirical speedups over non-SI--but rely on sufficiently fast +and accurate drafters, which are often unavailable in practice. We identify a +gap where SI can be slower than non-SI if drafters are too slow or inaccurate. +We close this gap by proving that DSI is faster than both SI and non-SI--given +any drafters. DSI is therefore not only faster than SI, but also unlocks the +acceleration of LMs for which SI fails. DSI leverages speculation parallelism +(SP), a novel type of task parallelism, to orchestrate target and drafter +instances that overlap in time, establishing a new foundational tradeoff +between computational resources and latency. Our simulations show that DSI is +1.29-1.92x faster than SI in single-node setups for various off-the-shelf LMs +and tasks. We open-source all our code. + +
+
+ comment: Published at ICLR 2025. (Link: + https://openreview.net/forum?id=cJd1BgZ9CS) +
+
+
+
+
+ + ♻ ☆ Unmasking Social Bots: How Confident Are We? + + +
+ Social bots remain a major vector for spreading disinformation on social +media and a menace to the public. Despite the progress made in developing +multiple sophisticated social bot detection algorithms and tools, bot detection +remains a challenging, unsolved problem that is fraught with uncertainty due to +the heterogeneity of bot behaviors, training data, and detection algorithms. +Detection models often disagree on whether to label the same account as bot or +human-controlled. However, they do not provide any measure of uncertainty to +indicate how much we should trust their results. We propose to address both bot +detection and the quantification of uncertainty at the account level - a novel +feature of this research. This dual focus is crucial as it allows us to +leverage additional information related to the quantified uncertainty of each +prediction, thereby enhancing decision-making and improving the reliability of +bot classifications. Specifically, our approach facilitates targeted +interventions for bots when predictions are made with high confidence and +suggests caution (e.g., gathering more data) when predictions are uncertain. + +
+
+ comment: 15 pages, 6 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Detecting Unsuccessful Students in Cybersecurity Exercises in Two + Different Learning Environments + + +
+ This full paper in the research track evaluates the usage of data logged from +cybersecurity exercises in order to predict students who are potentially at +risk of performing poorly. Hands-on exercises are essential for learning since +they enable students to practice their skills. In cybersecurity, hands-on +exercises are often complex and require knowledge of many topics. Therefore, +students may miss solutions due to gaps in their knowledge and become +frustrated, which impedes their learning. Targeted aid by the instructor helps, +but since the instructor's time is limited, efficient ways to detect struggling +students are needed. This paper develops automated tools to predict when a +student is having difficulty. We formed a dataset with the actions of 313 +students from two countries and two learning environments: KYPO CRP and +EDURange. These data are used in machine learning algorithms to predict the +success of students in exercises deployed in these environments. After +extracting features from the data, we trained and cross-validated eight +classifiers for predicting the exercise outcome and evaluated their predictive +power. The contribution of this paper is comparing two approaches to feature +engineering, modeling, and classification performance on data from two learning +environments. Using the features from either learning environment, we were able +to detect and distinguish between successful and struggling students. A +decision tree classifier achieved the highest balanced accuracy and sensitivity +with data from both learning environments. The results show that activity data +from cybersecurity exercises are suitable for predicting student success. In a +potential application, such models can aid instructors in detecting struggling +students and providing targeted help. We publish data and code for building +these models so that others can adopt or adapt them. + +
+
+ comment: Published in the FIE 2024 conference proceedings, see + https://doi.org/10.1109/FIE61694.2024.10893135 +
+
+
+
+
+ + ♻ ☆ Differentiable Weightless Neural Networks + + +
+ We introduce the Differentiable Weightless Neural Network (DWN), a model +based on interconnected lookup tables. Training of DWNs is enabled by a novel +Extended Finite Difference technique for approximate differentiation of binary +values. We propose Learnable Mapping, Learnable Reduction, and Spectral +Regularization to further improve the accuracy and efficiency of these models. +We evaluate DWNs in three edge computing contexts: (1) an FPGA-based hardware +accelerator, where they demonstrate superior latency, throughput, energy +efficiency, and model area compared to state-of-the-art solutions, (2) a +low-power microcontroller, where they achieve preferable accuracy to XGBoost +while subject to stringent memory constraints, and (3) ultra-low-cost chips, +where they consistently outperform small models in both accuracy and projected +hardware area. DWNs also compare favorably against leading approaches for +tabular datasets, with higher average rank. Overall, our work positions DWNs as +a pioneering solution for edge-compatible high-throughput neural networks. + +
+
+
+
+
+ + ♻ ☆ Prompting Fairness: Integrating Causality to Debias Large Language + Models + + +
+ Large language models (LLMs), despite their remarkable capabilities, are +susceptible to generating biased and discriminatory responses. As LLMs +increasingly influence high-stakes decision-making (e.g., hiring and +healthcare), mitigating these biases becomes critical. In this work, we propose +a causality-guided debiasing framework to tackle social biases, aiming to +reduce the objectionable dependence between LLMs' decisions and the social +information in the input. Our framework introduces a novel perspective to +identify how social information can affect an LLM's decision through different +causal pathways. Leveraging these causal insights, we outline principled +prompting strategies that regulate these pathways through selection mechanisms. +This framework not only unifies existing prompting-based debiasing techniques, +but also opens up new directions for reducing bias by encouraging the model to +prioritize fact-based reasoning over reliance on biased social cues. We +validate our framework through extensive experiments on real-world datasets +across multiple domains, demonstrating its effectiveness in debiasing LLM +decisions, even with only black-box access to the model. + +
+
+ comment: 24 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Tri-Clustering: A Multi-views Tri-level Information Fusion Context + Clustering Framework for Localization and Classification in Mammography + + +
+ Breast cancer is a significant global health issue, and the diagnosis of +breast imaging has always been challenging. Mammography images typically have +extremely high resolution, with lesions occupying only a very small area. +Down-sampling in neural networks can easily lead to the loss of +microcalcifications or subtle structures, making it difficult for traditional +neural network architectures to address these issues. To tackle these +challenges, we propose a Context Clustering Network with triple information +fusion. Firstly, compared to CNNs or transformers, we find that Context +clustering methods (1) are more computationally efficient and (2) can more +easily associate structural or pathological features, making them suitable for +the clinical tasks of mammography. Secondly, we propose a triple information +fusion mechanism that integrates global information, feature-based local +information, and patch-based local information. The proposed approach is +rigorously evaluated on two public datasets, Vindr-Mammo and CBIS-DDSM, using +five independent splits to ensure statistical robustness. Our method achieves +an AUC of 0.828 on Vindr-Mammo and 0.805 on CBIS-DDSM, outperforming the next +best method by 3.1% and 2.4%, respectively. These improvements are +statistically significant (p<0.05), underscoring the benefits of Context +Clustering Network with triple information fusion. Overall, our Context +Clustering framework demonstrates strong potential as a scalable and +cost-effective solution for large-scale mammography screening, enabling more +efficient and accurate breast cancer detection. Access to our method is +available at https://github.com/Sohyu1/Mammo_Clustering. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Synthesizing Physically Plausible Human Motions in 3D Scenes 3DV 2024 + + +
+ We present a physics-based character control framework for synthesizing +human-scene interactions. Recent advances adopt physics simulation to mitigate +artifacts produced by data-driven kinematic approaches. However, existing +physics-based methods mainly focus on single-object environments, resulting in +limited applicability in realistic 3D scenes with multi-objects. To address +such challenges, we propose a framework that enables physically simulated +characters to perform long-term interaction tasks in diverse, cluttered, and +unseen 3D scenes. The key idea is to decouple human-scene interactions into two +fundamental processes, Interacting and Navigating, which motivates us to +construct two reusable Controllers, namely InterCon and NavCon. Specifically, +InterCon uses two complementary policies to enable characters to enter or leave +the interacting state with a particular object (e.g., sitting on a chair or +getting up). To realize navigation in cluttered environments, we introduce +NavCon, where a trajectory following policy enables characters to track +pre-planned collision-free paths. Benefiting from the divide and conquer +strategy, we can train all policies in simple environments and directly apply +them in complex multi-object scenes through coordination from a rule-based +scheduler. Video and code are available at +https://github.com/liangpan99/InterScene. + +
+
+ comment: 3DV 2024 version +
+
+
+
+
+ + ♻ ☆ Leveraging Dual Process Theory in Language Agent Framework for Real-time + Simultaneous Human-AI Collaboration + + +
+ Agents built on large language models (LLMs) have excelled in turn-by-turn +human-AI collaboration but struggle with simultaneous tasks requiring real-time +interaction. Latency issues and the challenge of inferring variable human +strategies hinder their ability to make autonomous decisions without explicit +instructions. Through experiments with current independent System 1 and System +2 methods, we validate the necessity of using Dual Process Theory (DPT) in +real-time tasks. We propose DPT-Agent, a novel language agent framework that +integrates System 1 and System 2 for efficient real-time simultaneous human-AI +collaboration. DPT-Agent's System 1 uses a Finite-state Machine (FSM) and +code-as-policy for fast, intuitive, and controllable decision-making. +DPT-Agent's System 2 integrates Theory of Mind (ToM) and asynchronous +reflection to infer human intentions and perform reasoning-based autonomous +decisions. We demonstrate the effectiveness of DPT-Agent through further +experiments with rule-based agents and human collaborators, showing significant +improvements over mainstream LLM-based frameworks. DPT-Agent can effectively +help LLMs convert correct slow thinking and reasoning into executable actions, +thereby improving performance. To the best of our knowledge, DPT-Agent is the +first language agent framework that achieves successful real-time simultaneous +human-AI collaboration autonomously. Code of DPT-Agent can be found in +https://github.com/sjtu-marl/DPT-Agent. + +
+
+ comment: Preprint under review. Update the experimental results of the + DeepSeek-R1 series models, o3-mini-high and o3-mini-medium +
+
+
+
+
+ + ♻ ☆ Unleashing the Potential of Vision-Language Pre-Training for 3D + Zero-Shot Lesion Segmentation via Mask-Attribute Alignment ICLR 2025 + + +
+ Recent advancements in medical vision-language pre-training models have +driven significant progress in zero-shot disease recognition. However, +transferring image-level knowledge to pixel-level tasks, such as lesion +segmentation in 3D CT scans, remains a critical challenge. Due to the +complexity and variability of pathological visual characteristics, existing +methods struggle to align fine-grained lesion features not encountered during +training with disease-related textual representations. In this paper, we +present Malenia, a novel multi-scale lesion-level mask-attribute alignment +framework, specifically designed for 3D zero-shot lesion segmentation. Malenia +improves the compatibility between mask representations and their associated +elemental attributes, explicitly linking the visual features of unseen lesions +with the extensible knowledge learned from previously seen ones. Furthermore, +we design a Cross-Modal Knowledge Injection module to enhance both visual and +textual features with mutually beneficial information, effectively guiding the +generation of segmentation results. Comprehensive experiments across three +datasets and 12 lesion categories validate the superior performance of Malenia. + +
+
+ comment: Accepted as ICLR 2025 conference paper +
+
+
+
+
+ + ♻ ☆ From Screens to Scenes: A Survey of Embodied AI in Healthcare + + +
+ Healthcare systems worldwide face persistent challenges in efficiency, +accessibility, and personalization. Powered by modern AI technologies such as +multimodal large language models and world models, Embodied AI (EmAI) +represents a transformative frontier, offering enhanced autonomy and the +ability to interact with the physical world to address these challenges. As an +interdisciplinary and rapidly evolving research domain, "EmAI in healthcare" +spans diverse fields such as algorithms, robotics, and biomedicine. This +complexity underscores the importance of timely reviews and analyses to track +advancements, address challenges, and foster cross-disciplinary collaboration. +In this paper, we provide a comprehensive overview of the "brain" of EmAI for +healthcare, wherein we introduce foundational AI algorithms for perception, +actuation, planning, and memory, and focus on presenting the healthcare +applications spanning clinical interventions, daily care & companionship, +infrastructure support, and biomedical research. Despite its promise, the +development of EmAI for healthcare is hindered by critical challenges such as +safety concerns, gaps between simulation platforms and real-world applications, +the absence of standardized benchmarks, and uneven progress across +interdisciplinary domains. We discuss the technical barriers and explore +ethical considerations, offering a forward-looking perspective on the future of +EmAI in healthcare. A hierarchical framework of intelligent levels for EmAI +systems is also introduced to guide further development. By providing +systematic insights, this work aims to inspire innovation and practical +applications, paving the way for a new era of intelligent, patient-centered +healthcare. + +
+
+ comment: 56 pages, 11 figures, manuscript accepted by Information Fusion +
+
+
+
+
+ + ♻ ☆ Utilizing ChatGPT in a Data Structures and Algorithms Course: A Teaching + Assistant's Perspective + + +
+ Integrating large language models (LLMs) like ChatGPT into computer science +education offers transformative potential for complex courses such as data +structures and algorithms (DSA). This study examines ChatGPT as a supplementary +tool for teaching assistants (TAs), guided by structured prompts and human +oversight, to enhance instruction and student outcomes. A controlled experiment +compared traditional TA-led instruction with a hybrid approach where TAs used +ChatGPT-4o and ChatGPT o1 to generate exercises, clarify concepts, and provide +feedback. Structured prompts emphasized problem decomposition, real-world +context, and code examples, enabling tailored support while mitigating +over-reliance on AI. Results demonstrated the hybrid approach's efficacy, with +students in the ChatGPT-assisted group scoring 16.50 points higher on average +and excelling in advanced topics. However, ChatGPT's limitations necessitated +TA verification. This framework highlights the dual role of LLMs: augmenting TA +efficiency while ensuring accuracy through human oversight, offering a scalable +solution for human-AI collaboration in education. + +
+
+ comment: Accepted at CHI EA '25 (Extended Abstracts of the CHI Conference on + Human Factors in Computing Systems, 2025). The final version is available at + the External DOI +
+
+
+
+
+ + ♻ ☆ TradingAgents: Multi-Agents LLM Financial Trading Framework AAAI 2025 + + +
+ Significant progress has been made in automated problem-solving using +societies of agents powered by large language models (LLMs). In finance, +efforts have largely focused on single-agent systems handling specific tasks or +multi-agent frameworks independently gathering data. However, multi-agent +systems' potential to replicate real-world trading firms' collaborative +dynamics remains underexplored. TradingAgents proposes a novel stock trading +framework inspired by trading firms, featuring LLM-powered agents in +specialized roles such as fundamental analysts, sentiment analysts, technical +analysts, and traders with varied risk profiles. The framework includes Bull +and Bear researcher agents assessing market conditions, a risk management team +monitoring exposure, and traders synthesizing insights from debates and +historical data to make informed decisions. By simulating a dynamic, +collaborative trading environment, this framework aims to improve trading +performance. Detailed architecture and extensive experiments reveal its +superiority over baseline models, with notable improvements in cumulative +returns, Sharpe ratio, and maximum drawdown, highlighting the potential of +multi-agent LLM frameworks in financial trading. TradingAgents is available at +https://github.com/PioneerFintech. + +
+
+ comment: Multi-Agent AI in the Real World @ AAAI 2025 +
+
+
+
+
+ + ♻ ☆ LLaVA-Mini: Efficient Image and Video Large Multimodal Models with One + Vision Token ICLR 2025 + + +
+ The advent of real-time large multimodal models (LMMs) like GPT-4o has +sparked considerable interest in efficient LMMs. LMM frameworks typically +encode visual inputs into vision tokens (continuous representations) and +integrate them and textual instructions into the context of large language +models (LLMs), where large-scale parameters and numerous context tokens +(predominantly vision tokens) result in substantial computational overhead. +Previous efforts towards efficient LMMs always focus on replacing the LLM +backbone with smaller models, while neglecting the crucial issue of token +quantity. In this paper, we introduce LLaVA-Mini, an efficient LMM with minimal +vision tokens. To achieve a high compression ratio of vision tokens while +preserving visual information, we first analyze how LMMs understand vision +tokens and find that most vision tokens only play a crucial role in the early +layers of LLM backbone, where they mainly fuse visual information into text +tokens. Building on this finding, LLaVA-Mini introduces modality pre-fusion to +fuse visual information into text tokens in advance, thereby facilitating the +extreme compression of vision tokens fed to LLM backbone into one token. +LLaVA-Mini is a unified large multimodal model that can support the +understanding of images, high-resolution images, and videos in an efficient +manner. Experiments across 11 image-based and 7 video-based benchmarks +demonstrate that LLaVA-Mini outperforms LLaVA-v1.5 with just 1 vision token +instead of 576. Efficiency analyses reveal that LLaVA-Mini can reduce FLOPs by +77%, deliver low-latency responses within 40 milliseconds, and process over +10,000 frames of video on the GPU hardware with 24GB of memory. + +
+
+ comment: Accepted to ICLR 2025. Code: https://github.com/ictnlp/LLaVA-Mini + Model: https://huggingface.co/ICTNLP/llava-mini-llama-3.1-8b +
+
+
+
+
+ + ♻ ☆ GAMED: Knowledge Adaptive Multi-Experts Decoupling for Multimodal Fake + News Detection + + +
+ Multimodal fake news detection often involves modelling heterogeneous data +sources, such as vision and language. Existing detection methods typically rely +on fusion effectiveness and cross-modal consistency to model the content, +complicating understanding how each modality affects prediction accuracy. +Additionally, these methods are primarily based on static feature modelling, +making it difficult to adapt to the dynamic changes and relationships between +different data modalities. This paper develops a significantly novel approach, +GAMED, for multimodal modelling, which focuses on generating distinctive and +discriminative features through modal decoupling to enhance cross-modal +synergies, thereby optimizing overall performance in the detection process. +GAMED leverages multiple parallel expert networks to refine features and +pre-embed semantic knowledge to improve the experts' ability in information +selection and viewpoint sharing. Subsequently, the feature distribution of each +modality is adaptively adjusted based on the respective experts' opinions. +GAMED also introduces a novel classification technique to dynamically manage +contributions from different modalities, while improving the explainability of +decisions. Experimental results on the Fakeddit and Yang datasets demonstrate +that GAMED performs better than recently developed state-of-the-art models. The +source code can be accessed at https://github.com/slz0925/GAMED. + +
+
+
+
+
+ + ♻ ☆ Monet: Mixture of Monosemantic Experts for Transformers + + +
+ Understanding the internal computations of large language models (LLMs) is +crucial for aligning them with human values and preventing undesirable +behaviors like toxic content generation. However, mechanistic interpretability +is hindered by polysemanticity -- where individual neurons respond to multiple, +unrelated concepts. While Sparse Autoencoders (SAEs) have attempted to +disentangle these features through sparse dictionary learning, they have +compromised LLM performance due to reliance on post-hoc reconstruction loss. To +address this issue, we introduce Mixture of Monosemantic Experts for +Transformers (Monet) architecture, which incorporates sparse dictionary +learning directly into end-to-end Mixture-of-Experts pretraining. Our novel +expert decomposition method enables scaling the expert count to 262,144 per +layer while total parameters scale proportionally to the square root of the +number of experts. Our analyses demonstrate mutual exclusivity of knowledge +across experts and showcase the parametric knowledge encapsulated within +individual experts. Moreover, Monet allows knowledge manipulation over domains, +languages, and toxicity mitigation without degrading general performance. Our +pursuit of transparent LLMs highlights the potential of scaling expert counts +to enhance mechanistic interpretability and directly resect the internal +knowledge to fundamentally adjust model behavior. The source code and +pretrained checkpoints are available at https://github.com/dmis-lab/Monet. + +
+
+
+
+
+ + ♻ ☆ When Attention Sink Emerges in Language Models: An Empirical View ICLR 2025 + + +
+ Language Models (LMs) assign significant attention to the first token, even +if it is not semantically important, which is known as attention sink. This +phenomenon has been widely adopted in applications such as streaming/long +context generation, KV cache optimization, inference acceleration, model +quantization, and others. Despite its widespread use, a deep understanding of +attention sink in LMs is still lacking. In this work, we first demonstrate that +attention sinks exist universally in LMs with various inputs, even in small +models. Furthermore, attention sink is observed to emerge during the LM +pre-training, motivating us to investigate how optimization, data distribution, +loss function, and model architecture in LM pre-training influence its +emergence. We highlight that attention sink emerges after effective +optimization on sufficient training data. The sink position is highly +correlated with the loss function and data distribution. Most importantly, we +find that attention sink acts more like key biases, storing extra attention +scores, which could be non-informative and not contribute to the value +computation. We also observe that this phenomenon (at least partially) stems +from tokens' inner dependence on attention scores as a result of softmax +normalization. After relaxing such dependence by replacing softmax attention +with other attention operations, such as sigmoid attention without +normalization, attention sinks do not emerge in LMs up to 1B parameters. The +code is available at https://github.com/sail-sg/Attention-Sink. + +
+
+ comment: ICLR 2025 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Generating Visual Stories with Grounded and Coreferent Characters + + +
+ Characters are important in narratives. They move the plot forward, create +emotional connections, and embody the story's themes. Visual storytelling +methods focus more on the plot and events relating to it, without building the +narrative around specific characters. As a result, the generated stories feel +generic, with character mentions being absent, vague, or incorrect. To mitigate +these issues, we introduce the new task of character-centric story generation +and present the first model capable of predicting visual stories with +consistently grounded and coreferent character mentions. Our model is finetuned +on a new dataset which we build on top of the widely used VIST benchmark. +Specifically, we develop an automated pipeline to enrich VIST with visual and +textual character coreference chains. We also propose new evaluation metrics to +measure the richness of characters and coreference in stories. Experimental +results show that our model generates stories with recurring characters which +are consistent and coreferent to a larger extent compared to baselines and +state-of-the-art systems. + +
+
+
+
+
+ + ♻ ☆ Cheating Automatic LLM Benchmarks: Null Models Achieve High Win Rates ICLR 2025 + + +
+ Automatic LLM benchmarks, such as AlpacaEval 2.0, Arena-Hard-Auto, and +MT-Bench, have become popular for evaluating language models due to their +cost-effectiveness and scalability compared to human evaluation. Achieving high +win rates on these benchmarks can significantly boost the promotional impact of +newly released language models. This promotional benefit may motivate tricks, +such as manipulating model output length or style to game win rates, even +though several mechanisms have been developed to control length and disentangle +style to reduce gameability. Nonetheless, we show that even a "null model" that +always outputs a constant response (irrelevant to input instructions) can cheat +automatic benchmarks and achieve top-ranked win rates: an 86.5% LC win rate on +AlpacaEval 2.0; an 83.0 score on Arena-Hard-Auto; and a 9.55 score on MT-Bench. +Moreover, the crafted cheating outputs are transferable because we assume that +the instructions of these benchmarks (e.g., 805 samples of AlpacaEval 2.0) are +private and cannot be accessed. While our experiments are primarily +proof-of-concept, an adversary could use LLMs to generate more imperceptible +cheating responses, unethically benefiting from high win rates and promotional +impact. Our findings call for the development of anti-cheating mechanisms for +reliable automatic benchmarks. The code is available at +https://github.com/sail-sg/Cheating-LLM-Benchmarks. + +
+
+ comment: ICLR 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ Graph Transformers Dream of Electric Flow + + +
+ We show theoretically and empirically that the linear Transformer, when +applied to graph data, can implement algorithms that solve canonical problems +such as electric flow and eigenvector decomposition. The Transformer has access +to information on the input graph only via the graph's incidence matrix. We +present explicit weight configurations for implementing each algorithm, and we +bound the constructed Transformers' errors by the errors of the underlying +algorithms. Our theoretical findings are corroborated by experiments on +synthetic data. Additionally, on a real-world molecular regression task, we +observe that the linear Transformer is capable of learning a more effective +positional encoding than the default one based on Laplacian eigenvectors. Our +work is an initial step towards elucidating the inner-workings of the +Transformer for graph data. Code is available at +https://github.com/chengxiang/LinearGraphTransformer + +
+
+
+
+
+ + ♻ ☆ Tracking objects that change in appearance with phase synchrony + + +
+ Objects we encounter often change appearance as we interact with them. +Changes in illumination (shadows), object pose, or the movement of non-rigid +objects can drastically alter available image features. How do biological +visual systems track objects as they change? One plausible mechanism involves +attentional mechanisms for reasoning about the locations of objects +independently of their appearances -- a capability that prominent neuroscience +theories have associated with computing through neural synchrony. Here, we +describe a novel deep learning circuit that can learn to precisely control +attention to features separately from their location in the world through +neural synchrony: the complex-valued recurrent neural network (CV-RNN). Next, +we compare object tracking in humans, the CV-RNN, and other deep neural +networks (DNNs), using FeatureTracker: a large-scale challenge that asks +observers to track objects as their locations and appearances change in +precisely controlled ways. While humans effortlessly solved FeatureTracker, +state-of-the-art DNNs did not. In contrast, our CV-RNN behaved similarly to +humans on the challenge, providing a computational proof-of-concept for the +role of phase synchronization as a neural substrate for tracking +appearance-morphing objects as they move about. + +
+
+
+
+
+ + ♻ ☆ An Effective Automated Speaking Assessment Approach to Mitigating Data + Scarcity and Imbalanced Distribution NAACL 2024 + + +
+ Automated speaking assessment (ASA) typically involves automatic speech +recognition (ASR) and hand-crafted feature extraction from the ASR transcript +of a learner's speech. Recently, self-supervised learning (SSL) has shown +stellar performance compared to traditional methods. However, SSL-based ASA +systems are faced with at least three data-related challenges: limited +annotated data, uneven distribution of learner proficiency levels and +non-uniform score intervals between different CEFR proficiency levels. To +address these challenges, we explore the use of two novel modeling strategies: +metric-based classification and loss reweighting, leveraging distinct SSL-based +embedding features. Extensive experimental results on the ICNALE benchmark +dataset suggest that our approach can outperform existing strong baselines by a +sizable margin, achieving a significant improvement of more than 10% in CEFR +prediction accuracy. + +
+
+ comment: Accepted to NAACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ DIPSER: A Dataset for In-Person Student Engagement Recognition in the + Wild + + +
+ In this paper, a novel dataset is introduced, designed to assess student +attention within in-person classroom settings. This dataset encompasses RGB +camera data, featuring multiple cameras per student to capture both posture and +facial expressions, in addition to smartwatch sensor data for each individual. +This dataset allows machine learning algorithms to be trained to predict +attention and correlate it with emotion. A comprehensive suite of attention and +emotion labels for each student is provided, generated through self-reporting +as well as evaluations by four different experts. Our dataset uniquely combines +facial and environmental camera data, smartwatch metrics, and includes +underrepresented ethnicities in similar datasets, all within in-the-wild, +in-person settings, making it the most comprehensive dataset of its kind +currently available. + The dataset presented offers an extensive and diverse collection of data +pertaining to student interactions across different educational contexts, +augmented with additional metadata from other tools. This initiative addresses +existing deficiencies by offering a valuable resource for the analysis of +student attention and emotion in face-to-face lessons. + +
+
+
+
+
+ + ♻ ☆ MOVE: Effective and Harmless Ownership Verification via Embedded + External Features AAAI 2022 + + +
+ Currently, deep neural networks (DNNs) are widely adopted in different +applications. Despite their commercial value, training a well-performing DNN is +resource-consuming. Accordingly, the well-trained model is valuable +intellectual property for its owner. However, recent studies revealed the +threats of model stealing, where the adversaries can obtain a function-similar +copy of the victim model, even when they can only query the model. In this +paper, we propose an effective and harmless model ownership verification (MOVE) +to defend against different types of model stealing simultaneously, without +introducing new security risks. In general, we conduct the ownership +verification by verifying whether a suspicious model contains the knowledge of +defender-specified external features. Specifically, we embed the external +features by modifying a few training samples with style transfer. We then train +a meta-classifier to determine whether a model is stolen from the victim. This +approach is inspired by the understanding that the stolen models should contain +the knowledge of features learned by the victim model. In particular, +we develop our MOVE method under both white-box and black-box +settings and analyze its theoretical foundation to provide comprehensive model +protection. Extensive experiments on benchmark datasets verify the +effectiveness of our method and its resistance to potential adaptive attacks. +The codes for reproducing the main experiments of our method are available at +https://github.com/THUYimingLi/MOVE. + +
+
+ comment: This paper has been accepted by IEEE TPAMI 2025. It is the journal + extension of our conference paper in AAAI 2022 + (https://ojs.aaai.org/index.php/AAAI/article/view/20036). 18 pages +
+
+
+
+
+ + ♻ ☆ Permutation-Invariant Graph Partitioning: How Graph Neural Networks + Capture Structural Interactions? + + +
+ Graph Neural Networks (GNNs) have paved the way for being a cornerstone in +graph-related learning tasks. Yet, the ability of GNNs to capture structural +interactions within graphs remains under-explored. In this work, we address +this gap by drawing on the insight that permutation invariant graph +partitioning enables a powerful way of exploring structural interactions. We +establish theoretical connections between permutation invariant graph +partitioning and graph isomorphism, and then propose Graph Partitioning Neural +Networks (GPNNs), a novel architecture that efficiently enhances the expressive +power of GNNs in learning structural interactions. We analyze how partitioning +schemes and structural interactions contribute to GNN expressivity and their +trade-offs with complexity. Empirically, we demonstrate that GPNNs outperform +existing GNN models in capturing structural interactions across diverse graph +benchmark tasks. + +
+
+
+
+
+ + ♻ ☆ Boosting Jailbreak Attack with Momentum ICASSP 2025 + + +
+ Large Language Models (LLMs) have achieved remarkable success across diverse +tasks, yet they remain vulnerable to adversarial attacks, notably the +well-known jailbreak attack. In particular, the Greedy Coordinate Gradient +(GCG) attack has demonstrated efficacy in exploiting this vulnerability by +optimizing adversarial prompts through a combination of gradient heuristics and +greedy search. However, the efficiency of this attack has become a bottleneck +in the attacking process. To mitigate this limitation, in this paper we rethink +the generation of the adversarial prompts through an optimization lens, aiming +to stabilize the optimization process and harness more heuristic insights from +previous optimization iterations. Specifically, we propose the +Momentum Accelerated GCG (MAC) attack, +which integrates a momentum term into the gradient heuristic to boost and +stabilize the random search for tokens in adversarial prompts. Experimental +results showcase the notable enhancement achieved by MAC over baselines in +terms of attack success rate and optimization efficiency. Moreover, we +demonstrate that MAC can still exhibit superior performance for transfer +attacks and models under defense mechanisms. Our code is available at +https://github.com/weizeming/momentum-attack-llm. + +
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ PIG: Physics-Informed Gaussians as Adaptive Parametric Mesh + Representations + + +
+ The numerical approximation of partial differential equations (PDEs) using +neural networks has seen significant advancements through Physics-Informed +Neural Networks (PINNs). Despite their straightforward optimization framework +and flexibility in implementing various PDEs, PINNs often suffer from limited +accuracy due to the spectral bias of Multi-Layer Perceptrons (MLPs), which +struggle to effectively learn high-frequency and nonlinear components. +Recently, parametric mesh representations in combination with neural networks +have been investigated as a promising approach to eliminate the inductive bias +of MLPs. However, they usually require high-resolution grids and a large number +of collocation points to achieve high accuracy while avoiding overfitting. In +addition, the fixed positions of the mesh parameters restrict their +flexibility, making accurate approximation of complex PDEs challenging. To +overcome these limitations, we propose Physics-Informed Gaussians (PIGs), which +combine feature embeddings using Gaussian functions with a lightweight neural +network. Our approach uses trainable parameters for the mean and variance of +each Gaussian, allowing for dynamic adjustment of their positions and shapes +during training. This adaptability enables our model to optimally approximate +PDE solutions, unlike models with fixed parameter positions. Furthermore, the +proposed approach maintains the same optimization framework used in PINNs, +allowing us to benefit from their excellent properties. Experimental results +show the competitive performance of our model across various PDEs, +demonstrating its potential as a robust tool for solving complex PDEs. Our +project page is available at +https://namgyukang.github.io/Physics-Informed-Gaussians/ + +
+
+ comment: Project page: + https://namgyukang.github.io/Physics-Informed-Gaussians/ +
+
+
+
+
+ + ♻ ☆ Breaking the Reclustering Barrier in Centroid-based Deep Clustering ICLR 2025 + + +
+ This work investigates an important phenomenon in centroid-based deep +clustering (DC) algorithms: Performance quickly saturates after a period of +rapid early gains. Practitioners commonly address early saturation with +periodic reclustering, which we demonstrate to be insufficient to address +performance plateaus. We call this phenomenon the "reclustering barrier" and +empirically show when the reclustering barrier occurs, what its underlying +mechanisms are, and how it is possible to Break the Reclustering Barrier with +our algorithm BRB. BRB avoids early over-commitment to initial clusterings and +enables continuous adaptation to reinitialized clustering targets while +remaining conceptually simple. Applying our algorithm to widely-used +centroid-based DC algorithms, we show that (1) BRB consistently improves +performance across a wide range of clustering benchmarks, (2) BRB enables +training from scratch, and (3) BRB performs competitively against +state-of-the-art DC algorithms when combined with a contrastive loss. We +release our code and pre-trained models at +https://github.com/Probabilistic-and-Interactive-ML/breaking-the-reclustering-barrier . + +
+
+ comment: Accepted at ICLR 2025 (Camera-ready version) +
+
+
+
+
+ + ♻ ☆ Greener GRASS: Enhancing GNNs with Encoding, Rewiring, and Attention ICLR 2025 + + +
+ Graph Neural Networks (GNNs) have become important tools for machine learning +on graph-structured data. In this paper, we explore the synergistic combination +of graph encoding, graph rewiring, and graph attention, by introducing Graph +Attention with Stochastic Structures (GRASS), a novel GNN architecture. GRASS +utilizes relative random walk probabilities (RRWP) encoding and a novel +decomposed variant (D-RRWP) to efficiently capture structural information. It +rewires the input graph by superimposing a random regular graph to enhance +long-range information propagation. It also employs a novel additive attention +mechanism tailored for graph-structured data. Our empirical evaluations +demonstrate that GRASS achieves state-of-the-art performance on multiple +benchmark datasets, including a 20.3% reduction in mean absolute error on the +ZINC dataset. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Robust Weight Initialization for Tanh Neural Networks with Fixed Point + Analysis ICLR 2025 + + +
+ As a neural network's depth increases, it can improve generalization +performance. However, training deep networks is challenging due to gradient and +signal propagation issues. To address these challenges, extensive theoretical +research and various methods have been introduced. Despite these advances, +effective weight initialization methods for tanh neural networks remain +insufficiently investigated. This paper presents a novel weight initialization +method for neural networks with tanh activation function. Based on an analysis +of the fixed points of the function $\tanh(ax)$, the proposed method aims to +determine values of $a$ that mitigate activation saturation. A series of +experiments on various classification datasets and physics-informed neural +networks demonstrates that the proposed method outperforms Xavier +initialization methods (with or without normalization) in terms of robustness +across different network sizes, data efficiency, and convergence speed. Code is +available at https://github.com/1HyunwooLee/Tanh-Init + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ StochSync: Stochastic Diffusion Synchronization for Image Generation in + Arbitrary Spaces ICLR 2025 + + +
+ We propose a zero-shot method for generating images in arbitrary spaces +(e.g., a sphere for 360° panoramas and a mesh surface for texture) using a +pretrained image diffusion model. The zero-shot generation of various visual +content using a pretrained image diffusion model has been explored mainly in +two directions. First, Diffusion Synchronization—performing reverse diffusion +processes jointly across different projected spaces while synchronizing them in +the target space—generates high-quality outputs when enough conditioning is +provided, but it struggles in its absence. Second, Score Distillation +Sampling—gradually updating the target space data through gradient +descent—results in better coherence but often lacks detail. In this paper, we +reveal for the first time the interconnection between these two methods while +highlighting their differences. To this end, we propose StochSync, a novel +approach that combines the strengths of both, enabling effective performance +with weak conditioning. Our experiments demonstrate that StochSync provides the +best performance in 360° panorama generation (where image conditioning is +not given), outperforming previous finetuning-based methods, and also delivers +comparable results in 3D mesh texturing (where depth conditioning is provided) +with previous methods. + +
+
+ comment: Project page: https://stochsync.github.io/ (ICLR 2025) +
+
+
+
+
+ + ♻ ☆ Training-Free Message Passing for Learning on Hypergraphs + + +
+ Hypergraphs are crucial for modelling higher-order interactions in real-world +data. Hypergraph neural networks (HNNs) effectively utilise these structures by +message passing to generate informative node features for various downstream +tasks like node classification. However, the message passing module in existing +HNNs typically requires a computationally intensive training process, which +limits their practical use. To tackle this challenge, we propose an alternative +approach by decoupling the usage of hypergraph structural information from the +model learning stage. This leads to a novel training-free message passing +module, named TF-MP-Module, which can be precomputed in the data preprocessing +stage, thereby reducing the computational burden. We refer to the hypergraph +neural network equipped with our TF-MP-Module as TF-HNN. We theoretically +support the efficiency and effectiveness of TF-HNN by showing that: 1) It is +more training-efficient compared to existing HNNs; 2) It utilises as much +information as existing HNNs for node feature generation; and 3) It is robust +against the oversmoothing issue while using long-range interactions. +Experiments based on seven real-world hypergraph benchmarks in node +classification and hyperlink prediction show that, compared to state-of-the-art +HNNs, TF-HNN exhibits both competitive performance and superior training +efficiency. Specifically, on the large-scale benchmark, Trivago, TF-HNN +outperforms the node classification accuracy of the best baseline by 10% with +just 1% of the training time of that baseline. + +
+
+
+
+
+ + ♻ ☆ DiscoGraMS: Enhancing Movie Screen-Play Summarization using Movie + Character-Aware Discourse Graph NAACL 2025 + + +
+ Summarizing movie screenplays presents a unique set of challenges compared to +standard document summarization. Screenplays are not only lengthy, but also +feature a complex interplay of characters, dialogues, and scenes, with numerous +direct and subtle relationships and contextual nuances that are difficult for +machine learning models to accurately capture and comprehend. Recent attempts +at screenplay summarization focus on fine-tuning transformer-based pre-trained +models, but these models often fall short in capturing long-term dependencies +and latent relationships, and frequently encounter the "lost in the middle" +issue. To address these challenges, we introduce DiscoGraMS, a novel resource +that represents movie scripts as a movie character-aware discourse graph (CaD +Graph). This approach is well-suited for various downstream tasks, such as +summarization, question-answering, and salience detection. The model aims to +preserve all salient information, offering a more comprehensive and faithful +representation of the screenplay's content. We further explore a baseline +method that combines the CaD Graph with the corresponding movie script through +a late fusion of graph and text modalities, and we present very initial +promising results. + +
+
+ comment: Accepted at NAACL 2025 (Main) +
+
+
+
+
+ + ♻ ☆ Dist Loss: Enhancing Regression in Few-Shot Region through Distribution + Distance Constraint + + +
+ Imbalanced data distributions are prevalent in real-world scenarios, posing +significant challenges in both imbalanced classification and imbalanced +regression tasks. They often cause deep learning models to overfit in areas of +high sample density (many-shot regions) while underperforming in areas of low +sample density (few-shot regions). This characteristic restricts the utility of +deep learning models in various sectors, notably healthcare, where areas with +few-shot data hold greater clinical relevance. While recent studies have shown +the benefits of incorporating distribution information in imbalanced +classification tasks, such strategies are rarely explored in imbalanced +regression. In this paper, we address this issue by introducing a novel loss +function, termed Dist Loss, designed to minimize the distribution distance +between the model's predictions and the target labels in a differentiable +manner, effectively integrating distribution information into model training. +Dist Loss enables deep learning models to regularize their output distribution +during training, effectively enhancing their focus on few-shot regions. We have +conducted extensive experiments across three datasets spanning computer vision +and healthcare: IMDB-WIKI-DIR, AgeDB-DIR, and ECG-Ka-DIR. The results +demonstrate that Dist Loss effectively mitigates the negative impact of +imbalanced data distribution on model performance, achieving state-of-the-art +results in sparse data regions. Furthermore, Dist Loss is easy to integrate, +complementing existing methods. + +
+
+
+
+
+ + ♻ ☆ CLIPure: Purification in Latent Space via CLIP for Adversarially Robust + Zero-Shot Classification ICLR 2025 + + +
+ In this paper, we aim to build an adversarially robust zero-shot image +classifier. We ground our work on CLIP, a vision-language pre-trained encoder +model that can perform zero-shot classification by matching an image with text +prompts ``a photo of a &lt;class-name&gt;.''. Purification is the path we choose +since it does not require adversarial training on specific attack types and +thus can cope with any foreseen attacks. We then formulate purification risk as +the KL divergence between the joint distributions of the purification process +of denoising the adversarial samples and the attack process of adding +perturbations to benign samples, through bidirectional Stochastic Differential +Equations (SDEs). The final derived results inspire us to explore purification +in the multi-modal latent space of CLIP. We propose two variants for our +CLIPure approach: CLIPure-Diff which models the likelihood of images' latent +vectors with the DiffusionPrior module in DaLLE-2 (modeling the generation +process of CLIP's latent vectors), and CLIPure-Cos which models the likelihood +with the cosine similarity between the embeddings of an image and ``a photo of +a.''. As far as we know, CLIPure is the first purification method in +multi-modal latent space and CLIPure-Cos is the first purification method that +is not based on generative models, which substantially improves defense +efficiency. We conducted extensive experiments on CIFAR-10, ImageNet, and 13 +datasets that previous CLIP-based defense methods used for evaluating zero-shot +classification robustness. Results show that CLIPure boosts the SOTA robustness +by a large margin, e.g., from 71.7% to 91.1% on CIFAR10, from 59.6% to 72.6% on +ImageNet, and 108% relative improvements of average robustness on the 13 +datasets over previous SOTA. The code is available at +https://github.com/TMLResearchGroup-CAS/CLIPure. + +
+
+ comment: accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Path-Consistency: Prefix Enhancement for Efficient Inference in LLM + + +
+ To enhance the reasoning capabilities of large language models (LLMs), +self-consistency has gained significant popularity by combining multiple +sampling with majority voting. However, the state-of-the-art self-consistency +approaches consume substantial computational resources and lead to significant +additional time costs due to the multiple sampling. This prevents its full +potential from being realized in scenarios where computational resources are +critical. To improve the inference efficiency, this paper introduces +path-consistency, a method that leverages the confidence of answers +generated in earlier branches to identify the prefix of the most promising +path. By dynamically guiding the generation of subsequent branches based on +this prefix, the path-consistency mitigates both the errors and +redundancies from random or less useful sampling in self-consistency. As a +result, it can significantly accelerate the inference process by reducing the +number of tokens generated. Our extensive empirical evaluation shows that the +path-consistency achieves significant acceleration in inference +latency ranging from 7.8% to 40.5%, while maintaining or even improving +task accuracy across different datasets, including mathematical reasoning, +common sense reasoning, symbolic reasoning, and code generation. + +
+
+
+
+
+ + ♻ ☆ Pair-VPR: Place-Aware Pre-training and Contrastive Pair Classification + for Visual Place Recognition with Vision Transformers + + +
+ In this work we propose a novel joint training method for Visual Place +Recognition (VPR), which simultaneously learns a global descriptor and a pair +classifier for re-ranking. The pair classifier can predict whether a given pair +of images are from the same place or not. The network only comprises Vision +Transformer components for both the encoder and the pair classifier, and both +components are trained using their respective class tokens. In existing VPR +methods, typically the network is initialized using pre-trained weights from a +generic image dataset such as ImageNet. In this work we propose an alternative +pre-training strategy, by using Siamese Masked Image Modelling as a +pre-training task. We propose a Place-aware image sampling procedure from a +collection of large VPR datasets for pre-training our model, to learn visual +features tuned specifically for VPR. By re-using the Mask Image Modelling +encoder and decoder weights in the second stage of training, Pair-VPR can +achieve state-of-the-art VPR performance across five benchmark datasets with a +ViT-B encoder, along with further improvements in localization recall with +larger encoders. The Pair-VPR website is: +https://csiro-robotics.github.io/Pair-VPR. + +
+
+
+
+
+ + ♻ ☆ Fairness in Agentic AI: A Unified Framework for Ethical and Equitable + Multi-Agent System + + +
+ Ensuring fairness in decentralized multi-agent systems presents significant +challenges due to emergent biases, systemic inefficiencies, and conflicting +agent incentives. This paper provides a comprehensive survey of fairness in +multi-agent AI, introducing a novel framework where fairness is treated as a +dynamic, emergent property of agent interactions. The framework integrates +fairness constraints, bias mitigation strategies, and incentive mechanisms to +align autonomous agent behaviors with societal values while balancing +efficiency and robustness. Through empirical validation, we demonstrate that +incorporating fairness constraints results in more equitable decision-making. +This work bridges the gap between AI ethics and system design, offering a +foundation for accountable, transparent, and socially responsible multi-agent +AI systems. + +
+
+ comment: 12 pages, 4 figures, 1 table +
+
+
+
+
+ + ♻ ☆ WalnutData: A UAV Remote Sensing Dataset of Green Walnuts and Model + Evaluation + + +
+ The UAV technology is gradually maturing and can provide extremely powerful +support for smart agriculture and precise monitoring. Currently, there is no +dataset related to green walnuts in the field of agricultural computer vision. +Thus, in order to promote the algorithm design in the field of agricultural +computer vision, we used UAV to collect remote-sensing data from 8 walnut +sample plots. Considering that green walnuts are subject to various lighting +conditions and occlusion, we constructed a large-scale dataset with a +higher-granularity of target features - WalnutData. This dataset contains a +total of 30,240 images and 706,208 instances, and there are 4 target +categories: being illuminated by frontal light and unoccluded (A1), being +backlit and unoccluded (A2), being illuminated by frontal light and occluded +(B1), and being backlit and occluded (B2). Subsequently, we evaluated many +mainstream algorithms on WalnutData and used these evaluation results as the +baseline standard. The dataset and all evaluation results can be obtained at +https://github.com/1wuming/WalnutData. + +
+
+
+
+
+ + ♻ ☆ High-Resolution Image Synthesis via Next-Token Prediction + + +
+ Recently, autoregressive models have demonstrated remarkable performance in +class-conditional image generation. However, the application of next-token +prediction to high-resolution text-to-image generation remains largely +unexplored. In this paper, we introduce D-JEPA·T2I, an +autoregressive model based on continuous tokens that incorporates innovations +in both architecture and training strategy to generate high-quality, +photorealistic images at arbitrary resolutions, up to 4K. Architecturally, we +adopt the denoising joint embedding predictive architecture (D-JEPA) while +leveraging a multimodal visual transformer to effectively integrate textual and +visual features. Additionally, we introduce flow matching loss alongside the +proposed Visual Rotary Positional Embedding (VoPE) to enable continuous +resolution learning. In terms of training strategy, we propose a data feedback +mechanism that dynamically adjusts the sampling procedure based on statistical +analysis and an online learning critic model. This encourages the model to move +beyond its comfort zone, reducing redundant training on well-mastered scenarios +and compelling it to address more challenging cases with suboptimal generation +quality. For the first time, we achieve state-of-the-art high-resolution image +synthesis via next-token prediction. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ Mixture-of-Subspaces in Low-Rank Adaptation EMNLP 2024 + + +
+ In this paper, we introduce a subspace-inspired Low-Rank Adaptation (LoRA) +method, which is computationally efficient, easy to implement, and readily +applicable to large language, multimodal, and diffusion models. Initially, we +equivalently decompose the weights of LoRA into two subspaces, and find that +simply mixing them can enhance performance. To study such a phenomenon, we +revisit it through a fine-grained subspace lens, showing that such modification +is equivalent to employing a fixed mixer to fuse the subspaces. To be more +flexible, we jointly learn the mixer with the original LoRA weights, and term +the method Mixture-of-Subspaces LoRA (MoSLoRA). MoSLoRA consistently +outperforms LoRA on tasks in different modalities, including commonsense +reasoning, visual instruction tuning, and subject-driven text-to-image +generation, demonstrating its effectiveness and robustness. Codes are available +at https://github.com/wutaiqiang/MoSLoRA. + +
+
+ comment: EMNLP 2024 Main, Oral +
+
+
+
+
+ + ♻ ☆ Empathy Level Alignment via Reinforcement Learning for Empathetic + Response Generation + + +
+ Empathetic response generation, aiming to understand the user's situation and +feelings and respond empathically, is crucial in building human-like dialogue +systems. Traditional approaches typically employ maximum likelihood estimation +as the optimization objective during training, yet fail to align the empathy +levels between generated and target responses. To this end, we propose an +empathetic response generation framework using reinforcement learning (EmpRL). +The framework develops an effective empathy reward function and generates +empathetic responses by maximizing the expected reward through reinforcement +learning. EmpRL utilizes the pre-trained T5 model as the generator and further +fine-tunes it to initialize the policy. To align the empathy levels between +generated and target responses within a given context, an empathy reward +function containing three empathy communication mechanisms -- emotional +reaction, interpretation, and exploration -- is constructed using pre-designed +and pre-trained empathy identifiers. During reinforcement learning training, +the proximal policy optimization algorithm is used to fine-tune the policy, +enabling the generation of empathetic responses. Both automatic and human +evaluations demonstrate that the proposed EmpRL framework significantly +improves the quality of generated responses, enhances the similarity in empathy +levels between generated and target responses, and produces empathetic +responses covering both affective and cognitive aspects. + +
+
+ comment: Accepted by IEEE Transactions on Affective Computing +
+
+
+
+
+ + ♻ ☆ Tackling Data Corruption in Offline Reinforcement Learning via Sequence + Modeling ICLR2025 + + +
+ Learning policy from offline datasets through offline reinforcement learning +(RL) holds promise for scaling data-driven decision-making while avoiding +unsafe and costly online interactions. However, real-world data collected from +sensors or humans often contains noise and errors, posing a significant +challenge for existing offline RL methods, particularly when the real-world +data is limited. Our study reveals that prior research focusing on adapting +predominant offline RL methods based on temporal difference learning still +falls short under data corruption when the dataset is limited. In contrast, we +discover that vanilla sequence modeling methods, such as Decision Transformer, +exhibit robustness against data corruption, even without specialized +modifications. To unlock the full potential of sequence modeling, we propose +Robust Decision Transformer (RDT) by incorporating three simple yet effective +robust techniques: embedding dropout to improve the model's robustness against +erroneous inputs, Gaussian weighted learning to mitigate the effects of +corrupted labels, and iterative data correction to eliminate corrupted data +from the source. Extensive experiments on MuJoCo, Kitchen, and Adroit tasks +demonstrate RDT's superior performance under various data corruption scenarios +compared to prior methods. Furthermore, RDT exhibits remarkable robustness in a +more challenging setting that combines training-time data corruption with +test-time observation perturbations. These results highlight the potential of +sequence modeling for learning from noisy or corrupted offline datasets, +thereby promoting the reliable application of offline RL in real-world +scenarios. Our code is available at +https://github.com/jiawei415/RobustDecisionTransformer. + +
+
+ comment: Accepted by ICLR2025 +
+
+
+
+
+ + ♻ ☆ Efficient Automated Circuit Discovery in Transformers using Contextual + Decomposition + + +
+ Automated mechanistic interpretation research has attracted great interest +due to its potential to scale explanations of neural network internals to large +models. Existing automated circuit discovery work relies on activation patching +or its approximations to identify subgraphs in models for specific tasks +(circuits). They often suffer from slow runtime, approximation errors, and +specific requirements of metrics, such as non-zero gradients. In this work, we +introduce contextual decomposition for transformers (CD-T) to build +interpretable circuits in large language models. CD-T can produce circuits of +arbitrary level of abstraction, and is the first able to produce circuits as +fine-grained as attention heads at specific sequence positions efficiently. +CD-T consists of a set of mathematical equations to isolate contribution of +model features. Through recursively computing contribution of all nodes in a +computational graph of a model using CD-T followed by pruning, we are able to +reduce circuit discovery runtime from hours to seconds compared to +state-of-the-art baselines. On three standard circuit evaluation datasets +(indirect object identification, greater-than comparisons, and docstring +completion), we demonstrate that CD-T outperforms ACDC and EAP by better +recovering the manual circuits with an average of 97% ROC AUC under low +runtimes. In addition, we provide evidence that faithfulness of CD-T circuits +is not due to random chance by showing our circuits are 80% more faithful than +random circuits of up to 60% of the original model size. Finally, we show CD-T +circuits are able to perfectly replicate original models' behavior +(faithfulness $ = 1$) using fewer nodes than the baselines for all tasks. Our +results underscore the great promise of CD-T for efficient automated +mechanistic interpretability, paving the way for new insights into the workings +of large language models. + +
+
+
+
+
+ + ♻ ☆ STMA: A Spatio-Temporal Memory Agent for Long-Horizon Embodied Task + Planning + + +
+ A key objective of embodied intelligence is enabling agents to perform +long-horizon tasks in dynamic environments while maintaining robust +decision-making and adaptability. To achieve this goal, we propose the +Spatio-Temporal Memory Agent (STMA), a novel framework designed to enhance task +planning and execution by integrating spatio-temporal memory. STMA is built +upon three critical components: (1) a spatio-temporal memory module that +captures historical and environmental changes in real time, (2) a dynamic +knowledge graph that facilitates adaptive spatial reasoning, and (3) a +planner-critic mechanism that iteratively refines task strategies. We evaluate +STMA in the TextWorld environment on 32 tasks, involving multi-step planning +and exploration under varying levels of complexity. Experimental results +demonstrate that STMA achieves a 31.25% improvement in success rate and a 24.7% +increase in average score compared to the state-of-the-art model. The results +highlight the effectiveness of spatio-temporal memory in advancing the memory +capabilities of embodied agents. + +
+
+
+
+
+ + ♻ ☆ Exploring the Decentraland Economy: Multifaceted Parcel Attributes, Key + Insights, and Benchmarking + + +
+ This paper presents a comprehensive Decentraland parcels dataset, called +IITP-VDLand, sourced from diverse platforms such as Decentraland, OpenSea, +Etherscan, Google BigQuery, and various Social Media Platforms. Unlike existing +datasets which have limited attributes and records, IITP-VDLand offers a rich +array of attributes, encompassing parcel characteristics, trading history, past +activities, transactions, and social media interactions. Alongside, we +introduce a key attribute in the dataset, namely Rarity score, which measures +the uniqueness of each parcel within the virtual world. Addressing the +significant challenge posed by the dispersed nature of this data across various +sources, we employ a systematic approach, utilizing both available APIs and +custom scripts, to gather it. Subsequently, we meticulously curate and organize +the information into four distinct fragments: (1) Characteristics, (2) OpenSea +Trading History, (3) Ethereum Activity Transactions, and (4) Social Media. We +envisage that this dataset would serve as a robust resource for training +machine- and deep-learning models specifically designed to address real-world +challenges within the domain of Decentraland parcels. The performance +benchmarking of more than 20 state-of-the-art price prediction models on our +dataset yields promising results, achieving a maximum R2 score of 0.8251 and an +accuracy of 74.23% in case of Extra Trees Regressor and Classifier. The key +findings reveal that the ensemble models perform better than both deep learning +and linear models for our dataset. We observe a significant impact of +coordinates, geographical proximity, rarity score, and few other economic +indicators on the prediction of parcel prices. + +
+
+
+
+
+ + ♻ ☆ LANTERN: Accelerating Visual Autoregressive Models with Relaxed + Speculative Decoding ICLR 2025 + + +
+ Auto-Regressive (AR) models have recently gained prominence in image +generation, often matching or even surpassing the performance of diffusion +models. However, one major limitation of AR models is their sequential nature, +which processes tokens one at a time, slowing down generation compared to +models like GANs or diffusion-based methods that operate more efficiently. +While speculative decoding has proven effective for accelerating LLMs by +generating multiple tokens in a single forward, its application in visual AR +models remains largely unexplored. In this work, we identify a challenge in +this setting, which we term \textit{token selection ambiguity}, wherein visual +AR models frequently assign uniformly low probabilities to tokens, hampering +the performance of speculative decoding. To overcome this challenge, we propose +a relaxed acceptance condition referred to as LANTERN that leverages the +interchangeability of tokens in latent space. This relaxation restores the +effectiveness of speculative decoding in visual AR models by enabling more +flexible use of candidate tokens that would otherwise be prematurely rejected. +Furthermore, by incorporating a total variation distance bound, we ensure that +these speed gains are achieved without significantly compromising image quality +or semantic coherence. Experimental results demonstrate the efficacy of our +method in providing a substantial speed-up over speculative decoding. In +specific, compared to a naïve application of the state-of-the-art speculative +decoding, LANTERN increases speed-ups by $\mathbf{1.75}\times$ and +$\mathbf{1.82}\times$, as compared to greedy decoding and random sampling, +respectively, when applied to LlamaGen, a contemporary visual AR model. The +code is publicly available at https://github.com/jadohu/LANTERN. + +
+
+ comment: 30 pages, 13 figures, Accepted to ICLR 2025 (poster) +
+
+
+
+
+ + ♻ ☆ MobA: Multifaceted Memory-Enhanced Adaptive Planning for Efficient + Mobile Task Automation NAACL 2025 + + +
+ Existing Multimodal Large Language Model (MLLM)-based agents face significant +challenges in handling complex GUI (Graphical User Interface) interactions on +devices. These challenges arise from the dynamic and structured nature of GUI +environments, which integrate text, images, and spatial relationships, as well +as the variability in action spaces across different pages and tasks. To +address these limitations, we propose MobA, a novel MLLM-based mobile assistant +system. MobA introduces an adaptive planning module that incorporates a +reflection mechanism for error recovery and dynamically adjusts plans to align +with the real environment contexts and action module's execution capacity. +Additionally, a multifaceted memory module provides comprehensive memory +support to enhance adaptability and efficiency. We also present MobBench, a +dataset designed for complex mobile interactions. Experimental results on +MobBench and AndroidArena demonstrate MobA's ability to handle dynamic GUI +environments and perform complex mobile tasks. + +
+
+ comment: NAACL 2025 Demo Track +
+
+
+
+
+ + ♻ ☆ Improving vision-language alignment with graph spiking hybrid Networks + + +
+ To bridge the semantic gap between vision and language (VL), it is necessary +to develop a good alignment strategy, which includes handling semantic +diversity, abstract representation of visual information, and generalization +ability of models. Recent works use detector-based bounding boxes or patches +with regular partitions to represent visual semantics. While current paradigms +have made strides, they are still insufficient for fully capturing the nuanced +contextual relations among various objects. This paper proposes a comprehensive +visual semantic representation module, necessitating the utilization of +panoptic segmentation to generate coherent fine-grained semantic features. +Furthermore, we propose a novel Graph Spiking Hybrid Network (GSHN) that +integrates the complementary advantages of Spiking Neural Networks (SNNs) and +Graph Attention Networks (GATs) to encode visual semantic information. +Intriguingly, the model not only encodes the discrete and continuous latent +variables of instances but also adeptly captures both local and global +contextual features, thereby significantly enhancing the richness and diversity +of semantic representations. Leveraging the spatiotemporal properties inherent +in SNNs, we employ contrastive learning (CL) to enhance the similarity-based +representation of embeddings. This strategy alleviates the computational +overhead of the model and enriches meaningful visual representations by +constructing positive and negative sample pairs. We design an innovative +pre-training method, Spiked Text Learning (STL), which uses text features to +improve the encoding ability of discrete semantics. Experiments show that the +proposed GSHN exhibits promising results on multiple VL downstream tasks. + +
+
+
+
+
+ + ♻ ☆ Examining Alignment of Large Language Models through Representative + Heuristics: The Case of Political Stereotypes ICLR 2025 + + +
+ Examining the alignment of large language models (LLMs) has become +increasingly important, e.g., when LLMs fail to operate as intended. This study +examines the alignment of LLMs with human values for the domain of politics. +Prior research has shown that LLM-generated outputs can include political +leanings and mimic the stances of political parties on various issues. However, +the extent and conditions under which LLMs deviate from empirical positions are +insufficiently examined. To address this gap, we analyze the factors that +contribute to LLMs' deviations from empirical positions on political issues, +aiming to quantify these deviations and identify the conditions that cause +them. + Drawing on findings from cognitive science about representativeness +heuristics, i.e., situations where humans lean on representative attributes of +a target group in a way that leads to exaggerated beliefs, we scrutinize LLM +responses through this heuristics' lens. We conduct experiments to determine +how LLMs inflate predictions about political parties, which results in +stereotyping. We find that while LLMs can mimic certain political parties' +positions, they often exaggerate these positions more than human survey +respondents do. Also, LLMs tend to overemphasize representativeness more than +humans. This study highlights the susceptibility of LLMs to representativeness +heuristics, suggesting a potential vulnerability of LLMs that facilitates +political stereotyping. We also test prompt-based mitigation strategies, +finding that strategies that can mitigate representative heuristics in humans +are also effective in reducing the influence of representativeness on +LLM-generated responses. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor + Diagnosis + + +
+ Brain tumors can lead to neurological dysfunction, cognitive and +psychological changes, increased intracranial pressure, and seizures, posing +significant risks to health. The You Only Look Once (YOLO) series has shown +superior accuracy in medical imaging object detection. This paper presents a +novel SCC-YOLO architecture that integrates the SCConv module into YOLOv9. The +SCConv module optimizes convolutional efficiency by reducing spatial and +channel redundancy, enhancing image feature learning. We examine the effects of +different attention mechanisms with YOLOv9 for brain tumor detection using the +Br35H dataset and our custom dataset (Brain_Tumor_Dataset). Results indicate +that SCC-YOLO improved mAP50 by 0.3% on the Br35H dataset and by 0.5% on our +custom dataset compared to YOLOv9. SCC-YOLO achieves state-of-the-art +performance in brain tumor detection. + +
+
+
+
+
+ + ♻ ☆ Data-adaptive Differentially Private Prompt Synthesis for In-Context + Learning ICLR 2025 + + +
+ Large Language Models (LLMs) rely on the contextual information embedded in +examples/demonstrations to perform in-context learning (ICL). To mitigate the +risk of LLMs potentially leaking private information contained in examples in +the prompt, we introduce a novel data-adaptive differentially private algorithm +called AdaDPSyn to generate synthetic examples from the private dataset and +then use these synthetic examples to perform ICL. The objective of AdaDPSyn is +to adaptively adjust the noise level in the data synthesis mechanism according +to the inherent statistical properties of the data, thereby preserving high ICL +accuracy while maintaining formal differential privacy guarantees. A key +innovation in AdaDPSyn is the Precision-Focused Iterative Radius Reduction +technique, which dynamically refines the aggregation radius - the scope of data +grouping for noise addition - based on patterns observed in data clustering, +thereby minimizing the amount of additive noise. We conduct extensive +experiments on standard benchmarks and compare AdaDPSyn with DP few-shot +generation algorithm (Tang et al., 2023). The experiments demonstrate that +AdaDPSyn not only outperforms DP few-shot generation, but also maintains high +accuracy levels close to those of non-private baselines, providing an effective +solution for ICL with privacy protection. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ SeqAR: Jailbreak LLMs with Sequential Auto-Generated Characters NAACL 2025 + + +
+ The widespread applications of large language models (LLMs) have brought +about concerns regarding their potential misuse. Although aligned with human +preference data before release, LLMs remain vulnerable to various malicious +attacks. In this paper, we adopt a red-teaming strategy to enhance LLM safety +and introduce SeqAR, a simple yet effective framework to design jailbreak +prompts automatically. The SeqAR framework generates and optimizes multiple +jailbreak characters and then applies sequential jailbreak characters in a +single query to bypass the guardrails of the target LLM. Different from +previous work which relies on proprietary LLMs or seed jailbreak templates +crafted by human expertise, SeqAR can generate and optimize the jailbreak +prompt in a cold-start scenario using open-sourced LLMs without any seed +jailbreak templates. Experimental results show that SeqAR achieves attack +success rates of 88% and 60% in bypassing the safety alignment of GPT-3.5-1106 +and GPT-4, respectively. Furthermore, we extensively evaluate the +transferability of the generated templates across different LLMs and held-out +malicious requests, while also exploring defense strategies against the +jailbreak attack designed by SeqAR. + +
+
+ comment: Accepted by NAACL 2025 +
+
+
+
+
+ + ♻ ☆ MACPO: Weak-to-Strong Alignment via Multi-Agent Contrastive Preference + Optimization ICLR 2025 + + +
+ As large language models (LLMs) are rapidly advancing and achieving +near-human capabilities on specific tasks, aligning them with human values is +becoming more urgent. In scenarios where LLMs outperform humans, we face a +weak-to-strong alignment problem where we need to effectively align strong +student LLMs through weak supervision generated by weak teachers. Existing +alignment methods mainly focus on strong-to-weak alignment and self-alignment +settings, and it is impractical to adapt them to the much harder weak-to-strong +alignment setting. To fill this gap, we propose a multi-agent contrastive +preference optimization (MACPO) framework. MACPO facilitates weak teachers and +strong students to learn from each other by iteratively reinforcing unfamiliar +positive behaviors while penalizing familiar negative ones. To get this, we +devise a mutual positive behavior augmentation strategy to encourage weak +teachers and strong students to learn from each other's positive behavior and +further provide higher quality positive behavior for the next iteration. +Additionally, we propose a hard negative behavior construction strategy to +induce weak teachers and strong students to generate familiar negative behavior +by fine-tuning on negative behavioral data. Experimental results on the HH-RLHF +and PKU-SafeRLHF datasets, evaluated using both automatic metrics and human +judgments, demonstrate that MACPO simultaneously improves the alignment +performance of strong students and weak teachers. Moreover, as the number of +weak teachers increases, MACPO achieves better weak-to-strong alignment +performance through more iteration optimization rounds. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ EMT: A Visual Multi-Task Benchmark Dataset for Autonomous Driving in the + Arab Gulf Region + + +
+ This paper introduces the Emirates Multi-Task (EMT) dataset - the first +publicly available dataset for autonomous driving collected in the Arab Gulf +region. The EMT dataset captures the unique road topology, high traffic +congestion, and distinctive characteristics of the Gulf region, including +variations in pedestrian clothing and weather conditions. It contains over +30,000 frames from a dash-camera perspective, along with 570,000 annotated +bounding boxes, covering approximately 150 kilometers of driving routes. The +EMT dataset supports three primary tasks: tracking, trajectory forecasting and +intention prediction. Each benchmark dataset is complemented with corresponding +evaluations: (1) multi-agent tracking experiments, focusing on multi-class +scenarios and occlusion handling; (2) trajectory forecasting evaluation using +deep sequential and interaction-aware models; and (3) intention benchmark +experiments conducted for predicting agents' intentions from observed +trajectories. The dataset is publicly available at avlab.io/emt-dataset, and +pre-processing scripts along with evaluation models can be accessed at +github.com/AV-Lab/emt-dataset. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ User Intent to Use DeepSeek for Healthcare Purposes and their Trust in + the Large Language Model: Multinational Survey Study + + +
+ Large language models (LLMs) increasingly serve as interactive healthcare +resources, yet user acceptance remains underexplored. This study examines how +ease of use, perceived usefulness, trust, and risk perception interact to shape +intentions to adopt DeepSeek, an emerging LLM-based platform, for healthcare +purposes. A cross-sectional survey of 556 participants from India, the United +Kingdom, and the United States was conducted to measure perceptions and usage +patterns. Structural equation modeling assessed both direct and indirect +effects, including potential quadratic relationships. Results revealed that +trust plays a pivotal mediating role: ease of use exerts a significant indirect +effect on usage intentions through trust, while perceived usefulness +contributes to both trust development and direct adoption. By contrast, risk +perception negatively affects usage intent, emphasizing the importance of +robust data governance and transparency. Notably, significant non-linear paths +were observed for ease of use and risk, indicating threshold or plateau +effects. The measurement model demonstrated strong reliability and validity, +supported by high composite reliabilities, average variance extracted, and +discriminant validity measures. These findings extend technology acceptance and +health informatics research by illuminating the multifaceted nature of user +adoption in sensitive domains. Stakeholders should invest in trust-building +strategies, user-centric design, and risk mitigation measures to encourage +sustained and safe uptake of LLMs in healthcare. Future work can employ +longitudinal designs or examine culture-specific variables to further clarify +how user perceptions evolve over time and across different regulatory +environments. Such insights are critical for harnessing AI to enhance outcomes. + +
+
+
+
+
+ + ♻ ☆ Automated Design of Agentic Systems + + +
+ Researchers are investing substantial effort in developing powerful +general-purpose agents, wherein Foundation Models are used as modules within +agentic systems (e.g. Chain-of-Thought, Self-Reflection, Toolformer). However, +the history of machine learning teaches us that hand-designed solutions are +eventually replaced by learned solutions. We describe a newly forming research +area, Automated Design of Agentic Systems (ADAS), which aims to automatically +create powerful agentic system designs, including inventing novel building +blocks and/or combining them in new ways. We further demonstrate that there is +an unexplored yet promising approach within ADAS where agents can be defined in +code and new agents can be automatically discovered by a meta agent programming +ever better ones in code. Given that programming languages are Turing Complete, +this approach theoretically enables the learning of any possible agentic +system: including novel prompts, tool use, workflows, and combinations thereof. +We present a simple yet effective algorithm named Meta Agent Search to +demonstrate this idea, where a meta agent iteratively programs interesting new +agents based on an ever-growing archive of previous discoveries. Through +extensive experiments across multiple domains including coding, science, and +math, we show that our algorithm can progressively invent agents with novel +designs that greatly outperform state-of-the-art hand-designed agents. +Importantly, we consistently observe the surprising result that agents invented +by Meta Agent Search maintain superior performance even when transferred across +domains and models, demonstrating their robustness and generality. Provided we +develop it safely, our work illustrates the potential of an exciting new +research direction toward automatically designing ever-more powerful agentic +systems to benefit humanity. + +
+
+ comment: Website: https://shengranhu.com/ADAS +
+
+
+
+
+ + ♻ ☆ L3Ms -- Lagrange Large Language Models ICLR + + +
+ Supervised fine-tuning (SFT) and alignment of large language models (LLMs) +are key steps in providing a good user experience. However, the concept of an +appropriate alignment is inherently application-dependent, and current methods +often rely on heuristic choices to drive optimization. In this work, we +formulate SFT and alignment as a constrained optimization problem: the LLM is +fine-tuned on a task while being required to meet application-specific +requirements, without resorting to heuristics. To solve this, we propose +Lagrange Large Language Models (L3Ms), which employ logarithmic barriers to +enforce the constraints. This approach allows for the customization of L3Ms +across diverse applications while avoiding heuristic-driven processes. We +experimentally demonstrate the versatility and efficacy of L3Ms in achieving +tailored alignments for various applications. + +
+
+ comment: International Conference on Learning Representations (ICLR), 2025 +
+
+
+
+
+ + ♻ ☆ An Empirical Analysis of Uncertainty in Large Language Model Evaluations ICLR 2025 + + +
+ As LLM-as-a-Judge emerges as a new paradigm for assessing large language +models (LLMs), concerns have been raised regarding the alignment, bias, and +stability of LLM evaluators. While substantial work has focused on alignment +and bias, little research has concentrated on the stability of LLM evaluators. +In this paper, we conduct extensive experiments involving 9 widely used LLM +evaluators across 2 different evaluation settings to investigate the +uncertainty in model-based LLM evaluations. We pinpoint that LLM evaluators +exhibit varying uncertainty based on model families and sizes. With careful +comparative analyses, we find that employing special prompting strategies, +whether during inference or post-training, can alleviate evaluation uncertainty +to some extent. By utilizing uncertainty to enhance LLM's reliability and +detection capability in Out-Of-Distribution (OOD) data, we further fine-tune an +uncertainty-aware LLM evaluator named ConfiLM using a human-annotated +fine-tuning set and assess ConfiLM's OOD evaluation ability on a manually +designed test set sourced from the 2024 Olympics. Experimental results +demonstrate that incorporating uncertainty as additional information during the +fine-tuning phase can largely improve the model's evaluation performance in OOD +scenarios. The code and data are released at: +https://github.com/hasakiXie123/LLM-Evaluator-Uncertainty. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Snuffy: Efficient Whole Slide Image Classifier ECCV 2024 + + +
+ Whole Slide Image (WSI) classification with multiple instance learning (MIL) +in digital pathology faces significant computational challenges. Current +methods mostly rely on extensive self-supervised learning (SSL) for +satisfactory performance, requiring long training periods and considerable +computational resources. At the same time, no pre-training affects performance +due to domain shifts from natural images to WSIs. We introduce Snuffy +architecture, a novel MIL-pooling method based on sparse transformers that +mitigates performance loss with limited pre-training and enables continual +few-shot pre-training as a competitive option. Our sparsity pattern is tailored +for pathology and is theoretically proven to be a universal approximator with +the tightest probabilistic sharp bound on the number of layers for sparse +transformers, to date. We demonstrate Snuffy's effectiveness on CAMELYON16 and +TCGA Lung cancer datasets, achieving superior WSI and patch-level accuracies. +The code is available on https://github.com/jafarinia/snuffy. + +
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Global $\mathcal{L}^2$ minimization at uniform exponential rate via + geometrically adapted gradient descent in Deep Learning + + +
+ We consider the scenario of supervised learning in Deep Learning (DL) +networks, and exploit the arbitrariness of choice in the Riemannian metric +relative to which the gradient descent flow can be defined (a general fact of +differential geometry). In the standard approach to DL, the gradient flow on +the space of parameters (weights and biases) is defined with respect to the +Euclidean metric. Here instead, we choose the gradient flow with respect to the +Euclidean metric in the output layer of the DL network. This naturally induces +two modified versions of the gradient descent flow in the parameter space, one +adapted for the overparametrized setting, and the other for the +underparametrized setting. In the overparametrized case, we prove that, +provided that a rank condition holds, all orbits of the modified gradient +descent drive the ${\mathcal L}^2$ cost to its global minimum at a uniform +exponential convergence rate; one thereby obtains an a priori stopping time for +any prescribed proximity to the global minimum. We point out relations of the +latter to sub-Riemannian geometry. Moreover, we generalize the above framework +to the situation in which the rank condition does not hold; in particular, we +show that local equilibria can only exist if a rank loss occurs, and that +generically, they are not isolated points, but elements of a critical +submanifold of parameter space. + +
+
+ comment: AMS Latex, 20 pages. Typos corrected, references and comments added +
+
+
+
+
+ + ♻ ☆ A Survey on Large Language Model based Autonomous Agents + + +
+ Autonomous agents have long been a prominent research focus in both academic +and industry communities. Previous research in this field often focuses on +training agents with limited knowledge within isolated environments, which +diverges significantly from human learning processes, and thus makes the agents +hard to achieve human-like decisions. Recently, through the acquisition of vast +amounts of web knowledge, large language models (LLMs) have demonstrated +remarkable potential in achieving human-level intelligence. This has sparked an +upsurge in studies investigating LLM-based autonomous agents. In this paper, we +present a comprehensive survey of these studies, delivering a systematic review +of the field of LLM-based autonomous agents from a holistic perspective. More +specifically, we first discuss the construction of LLM-based autonomous agents, +for which we propose a unified framework that encompasses a majority of the +previous work. Then, we present a comprehensive overview of the diverse +applications of LLM-based autonomous agents in the fields of social science, +natural science, and engineering. Finally, we delve into the evaluation +strategies commonly used for LLM-based autonomous agents. Based on the previous +studies, we also present several challenges and future directions in this +field. To keep track of this field and continuously update our survey, we +maintain a repository of relevant references at +https://github.com/Paitesanshi/LLM-Agent-Survey. + +
+
+ comment: Correcting several typos, 35 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Generalization v.s. Memorization: Tracing Language Models' Capabilities + Back to Pretraining Data ICLR 2025 + + +
+ The impressive capabilities of large language models (LLMs) have sparked +debate over whether these models genuinely generalize to unseen tasks or +predominantly rely on memorizing vast amounts of pretraining data. To explore +this issue, we introduce an extended concept of memorization, distributional +memorization, which measures the correlation between the LLM output +probabilities and the pretraining data frequency. To effectively capture +task-specific pretraining data frequency, we propose a novel task-gram language +model, which is built by counting the co-occurrence of semantically related +$n$-gram pairs from task inputs and outputs in the pretraining corpus. Using +the Pythia models trained on the Pile dataset, we evaluate four distinct tasks: +machine translation, factual question answering, world knowledge understanding, +and math reasoning. Our findings reveal varying levels of memorization, with +the strongest effect observed in factual question answering. Furthermore, while +model performance improves across all tasks as LLM size increases, only factual +question answering shows an increase in memorization, whereas machine +translation and reasoning tasks exhibit greater generalization, producing more +novel outputs. This study demonstrates that memorization plays a larger role in +simpler, knowledge-intensive tasks, while generalization is the key for harder, +reasoning-based tasks, providing a scalable method for analyzing large +pretraining corpora in greater depth. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ CBraMod: A Criss-Cross Brain Foundation Model for EEG Decoding ICLR 2025 + + +
+ Electroencephalography (EEG) is a non-invasive technique to measure and +record brain electrical activity, widely used in various BCI and healthcare +applications. Early EEG decoding methods rely on supervised learning, limited +by specific tasks and datasets, hindering model performance and +generalizability. With the success of large language models, there is a growing +body of studies focusing on EEG foundation models. However, these studies still +leave challenges: Firstly, most of existing EEG foundation models employ full +EEG modeling strategy. It models the spatial and temporal dependencies between +all EEG patches together, but ignores that the spatial and temporal +dependencies are heterogeneous due to the unique structural characteristics of +EEG signals. Secondly, existing EEG foundation models have limited +generalizability on a wide range of downstream BCI tasks due to varying formats +of EEG data, making it challenging to adapt to. To address these challenges, we +propose a novel foundation model called CBraMod. Specifically, we devise a +criss-cross transformer as the backbone to thoroughly leverage the structural +characteristics of EEG signals, which can model spatial and temporal +dependencies separately through two parallel attention mechanisms. And we +utilize an asymmetric conditional positional encoding scheme which can encode +positional information of EEG patches and be easily adapted to the EEG with +diverse formats. CBraMod is pre-trained on a very large corpus of EEG through +patch-based masked EEG reconstruction. We evaluate CBraMod on up to 10 +downstream BCI tasks (12 public datasets). CBraMod achieves the +state-of-the-art performance across the wide range of tasks, proving its strong +capability and generalizability. The source code is publicly available at +https://github.com/wjq-learning/CBraMod. + +
+
+ comment: Accepted by The Thirteenth International Conference on Learning + Representations (ICLR 2025) +
+
+
+
+
+ + ♻ ☆ TFG-Flow: Training-free Guidance in Multimodal Generative Flow + + +
+ Given an unconditional generative model and a predictor for a target property +(e.g., a classifier), the goal of training-free guidance is to generate samples +with desirable target properties without additional training. As a highly +efficient technique for steering generative models toward flexible outcomes, +training-free guidance has gained increasing attention in diffusion models. +However, existing methods only handle data in continuous spaces, while many +scientific applications involve both continuous and discrete data (referred to +as multimodality). Another emerging trend is the growing use of the simple and +general flow matching framework in building generative foundation models, where +guided generation remains under-explored. To address this, we introduce +TFG-Flow, a novel training-free guidance method for multimodal generative flow. +TFG-Flow addresses the curse-of-dimensionality while maintaining the property +of unbiased sampling in guiding discrete variables. We validate TFG-Flow on +four molecular design tasks and show that TFG-Flow has great potential in drug +design by generating molecules with desired properties. + +
+
+
+
+
+ + ♻ ☆ Efficient Imitation Without Demonstrations via Value-Penalized Auxiliary + Control from Examples ICRA'25 + + +
+ Common approaches to providing feedback in reinforcement learning are the use +of hand-crafted rewards or full-trajectory expert demonstrations. +Alternatively, one can use examples of completed tasks, but such an approach +can be extremely sample inefficient. We introduce value-penalized auxiliary +control from examples (VPACE), an algorithm that significantly improves +exploration in example-based control by adding examples of simple auxiliary +tasks and an above-success-level value penalty. Across both simulated and real +robotic environments, we show that our approach substantially improves learning +efficiency for challenging tasks, while maintaining bounded value estimates. +Preliminary results also suggest that VPACE may learn more efficiently than the +more common approaches of using full trajectories or true sparse rewards. +Project site: https://papers.starslab.ca/vpace/ . + +
+
+ comment: Accepted to the IEEE International Conference on Robotics and + Automation (ICRA'25), Atlanta, USA, May 19-23, 2025 +
+
+
+
+
+ + ♻ ☆ Image Watermarks are Removable Using Controllable Regeneration from + Clean Noise ICLR2025 + + +
+ Image watermark techniques provide an effective way to assert ownership, +deter misuse, and trace content sources, which has become increasingly +essential in the era of large generative models. A critical attribute of +watermark techniques is their robustness against various manipulations. In this +paper, we introduce a watermark removal approach capable of effectively +nullifying state-of-the-art watermarking techniques. Our primary insight +involves regenerating the watermarked image starting from a clean Gaussian +noise via a controllable diffusion model, utilizing the extracted semantic and +spatial features from the watermarked image. The semantic control adapter and +the spatial control network are specifically trained to control the denoising +process towards ensuring image quality and enhancing consistency between the +cleaned image and the original watermarked image. To achieve a smooth trade-off +between watermark removal performance and image consistency, we further propose +an adjustable and controllable regeneration scheme. This scheme adds varying +numbers of noise steps to the latent representation of the watermarked image, +followed by a controlled denoising process starting from this noisy latent +representation. As the number of noise steps increases, the latent +representation progressively approaches clean Gaussian noise, facilitating the +desired trade-off. We apply our watermark removal methods across various +watermarking techniques, and the results demonstrate that our methods offer +superior visual consistency/quality and enhanced watermark removal performance +compared to existing regeneration approaches. Our code is available at +https://github.com/yepengliu/CtrlRegen. + +
+
+ comment: ICLR2025 +
+
+
+
+
+ + ♻ ☆ ACES: Automatic Cohort Extraction System for Event-Stream Datasets ICLR 2025 + + +
+ Reproducibility remains a significant challenge in machine learning (ML) for +healthcare. Datasets, model pipelines, and even task or cohort definitions are +often private in this field, leading to a significant barrier in sharing, +iterating, and understanding ML results on electronic health record (EHR) +datasets. We address a significant part of this problem by introducing the +Automatic Cohort Extraction System (ACES) for event-stream data. This library +is designed to simultaneously simplify the development of tasks and cohorts for +ML in healthcare and also enable their reproduction, both at an exact level for +single datasets and at a conceptual level across datasets. To accomplish this, +ACES provides: (1) a highly intuitive and expressive domain-specific +configuration language for defining both dataset-specific concepts and +dataset-agnostic inclusion or exclusion criteria, and (2) a pipeline to +automatically extract patient records that meet these defined criteria from +real-world data. ACES can be automatically applied to any dataset in either the +Medical Event Data Standard (MEDS) or Event Stream GPT (ESGPT) formats, or to +*any* dataset in which the necessary task-specific predicates can be extracted +in an event-stream form. ACES has the potential to significantly lower the +barrier to entry for defining ML tasks in representation learning, redefine the +way researchers interact with EHR datasets, and significantly improve the state +of reproducibility for ML studies using this modality. ACES is available at: +https://github.com/justin13601/aces. + +
+
+ comment: [ICLR 2025] For the latest ACES online documentation, please see + https://eventstreamaces.readthedocs.io/en/latest/ +
+
+
+
+
+ + ♻ ☆ L3Ms - Lagrange Large Language Models ICLR + + +
+ Supervised fine-tuning (SFT) and alignment of large language models (LLMs) +are key steps in providing a good user experience. However, the concept of an +appropriate alignment is inherently application-dependent, and current methods +often rely on heuristic choices to drive optimization. In this work, we +formulate SFT and alignment as a constrained optimization problem: the LLM is +fine-tuned on a task while being required to meet application-specific +requirements, without resorting to heuristics. To solve this, we propose +Lagrange Large Language Models (L3Ms), which employ logarithmic barriers to +enforce the constraints. This approach allows for the customization of L3Ms +across diverse applications while avoiding heuristic-driven processes. We +experimentally demonstrate the versatility and efficacy of L3Ms in achieving +tailored alignments for various applications. + +
+
+ comment: International Conference on Learning Representations (ICLR), 2025 +
+
+
+
+
+
+
+
+ + Genomics 1 + +
+
+
+ + ♻ ☆ SARS-CoV-2 Wastewater Genomic Surveillance: Approaches, Challenges, and + Opportunities + + +
+ During the SARS-CoV-2 pandemic, wastewater-based genomic surveillance (WWGS) +emerged as an efficient viral surveillance tool that takes into account +asymptomatic cases and can identify known and novel mutations and offers the +opportunity to assign known virus lineages based on the detected mutations +profiles. WWGS can also hint towards novel or cryptic lineages, but it is +difficult to clearly identify and define novel lineages from wastewater (WW) +alone. While WWGS has significant advantages in monitoring SARS-CoV-2 viral +spread, technical challenges remain, including poor sequencing coverage and +quality due to viral RNA degradation. As a result, the viral RNAs in wastewater +have low concentrations and are often fragmented, making sequencing difficult. +WWGS analysis requires advanced computational tools that are yet to be +developed and benchmarked. The existing bioinformatics tools used to analyze +wastewater sequencing data are often based on previously developed methods for +quantifying the expression of transcripts or viral diversity. Those methods +were not developed for wastewater sequencing data specifically, and are not +optimized to address unique challenges associated with wastewater. While +specialized tools for analysis of wastewater sequencing data have also been +developed recently, it remains to be seen how they will perform given the +ongoing evolution of SARS-CoV-2 and the decline in testing and patient-based +genomic surveillance. Here, we discuss opportunities and challenges associated +with WWGS, including sample preparation, sequencing technology, and +bioinformatics methods. + +
+
+ comment: V Munteanu and M Saldana contributed equally to this work. M + Hölzer, A Smith and S Mangul jointly supervised this work. For + correspondence: serghei.mangul@gmail.com +
+
+
+
+
+
+
+
+ + Machine Learning 64 + +
+
+
+ + ♻ ☆ Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of + Encoders + + +
+ The ability to accurately interpret complex visual information is a crucial +topic of multimodal large language models (MLLMs). Recent work indicates that +enhanced visual perception significantly reduces hallucinations and improves +performance on resolution-sensitive tasks, such as optical character +recognition and document analysis. A number of recent MLLMs achieve this goal +using a mixture of vision encoders. Despite their success, there is a lack of +systematic comparisons and detailed ablation studies addressing critical +aspects, such as expert selection and the integration of multiple vision +experts. This study provides an extensive exploration of the design space for +MLLMs using a mixture of vision encoders and resolutions. Our findings reveal +several underlying principles common to various existing strategies, leading to +a streamlined yet effective design approach. We discover that simply +concatenating visual tokens from a set of complementary vision encoders is as +effective as more complex mixing architectures or strategies. We additionally +introduce Pre-Alignment to bridge the gap between vision-focused encoders and +language tokens, enhancing model coherence. The resulting family of MLLMs, +Eagle, surpasses other leading open-source models on major MLLM benchmarks. + +
+
+ comment: Github: https://github.com/NVlabs/Eagle, HuggingFace: + https://huggingface.co/NVEagle +
+
+
+
+
+ + ♻ ☆ SymbolFit: Automatic Parametric Modeling with Symbolic Regression + + +
+ We introduce SymbolFit, a framework that automates parametric modeling by +using symbolic regression to perform a machine-search for functions that fit +the data while simultaneously providing uncertainty estimates in a single run. +Traditionally, constructing a parametric model to accurately describe binned +data has been a manual and iterative process, requiring an adequate functional +form to be determined before the fit can be performed. The main challenge +arises when the appropriate functional forms cannot be derived from first +principles, especially when there is no underlying true closed-form function +for the distribution. In this work, we develop a framework that automates and +streamlines the process by utilizing symbolic regression, a machine learning +technique that explores a vast space of candidate functions without requiring a +predefined functional form because the functional form itself is treated as a +trainable parameter, making the process far more efficient and effortless than +traditional regression methods. We demonstrate the framework in high-energy +physics experiments at the CERN Large Hadron Collider (LHC) using five real +proton-proton collision datasets from new physics searches, including +background modeling in resonance searches for high-mass dijet, trijet, +paired-dijet, diphoton, and dimuon events. We show that our framework can +flexibly and efficiently generate a wide range of candidate functions that fit +a nontrivial distribution well using a simple fit configuration that varies +only by random seed, and that the same fit configuration, which defines a vast +function space, can also be applied to distributions of different shapes, +whereas achieving a comparable result with traditional methods would have +required extensive manual effort. + +
+
+ comment: 50 pages, 35 figures. Under review. The API can be used + out-of-the-box and is available at https://github.com/hftsoi/symbolfit +
+
+
+
+
+ + ♻ ☆ Quantum time dynamics mediated by the Yang-Baxter equation and + artificial neural networks + + +
+ Quantum computing shows great potential, but errors pose a significant +challenge. This study explores new strategies for mitigating quantum errors +using artificial neural networks (ANN) and the Yang-Baxter equation (YBE). +Unlike traditional error mitigation methods, which are computationally +intensive, we investigate artificial error mitigation. We developed a novel +method that combines ANN for noise mitigation with the YBE to generate +noisy data. This approach effectively reduces noise in quantum simulations, +enhancing the accuracy of the results. The YBE rigorously preserves quantum +correlations and symmetries in spin chain simulations in certain classes of +integrable lattice models, enabling effective compression of quantum circuits +while retaining linear scalability with the number of qubits. This compression +facilitates both full and partial implementations, allowing the generation of +noisy quantum data on hardware alongside noiseless simulations using classical +platforms. By introducing controlled noise through the YBE, we enhance the +dataset for error mitigation. We train an ANN model on partial data from +quantum simulations, demonstrating its effectiveness in mitigating errors in +time-evolving quantum states, providing a scalable framework to enhance quantum +computation fidelity, particularly in noisy intermediate-scale quantum (NISQ) +systems. We demonstrate the efficacy of this approach by performing quantum +time dynamics simulations using the Heisenberg XY Hamiltonian on real quantum +devices. + +
+
+
+
+
+ + ♻ ☆ Heterogeneous Graph Neural Network on Semantic Tree AAAI 2025 + + +
+ The recent past has seen an increasing interest in Heterogeneous Graph Neural +Networks (HGNNs), since many real-world graphs are heterogeneous in nature, +from citation graphs to email graphs. However, existing methods ignore a tree +hierarchy among metapaths, naturally constituted by different node types and +relation types. In this paper, we present HetTree, a novel HGNN that models +both the graph structure and heterogeneous aspects in a scalable and effective +manner. Specifically, HetTree builds a semantic tree data structure to capture +the hierarchy among metapaths. To effectively encode the semantic tree, HetTree +uses a novel subtree attention mechanism to emphasize metapaths that are more +helpful in encoding parent-child relationships. Moreover, HetTree proposes +carefully matching pre-computed features and labels correspondingly, +constituting a complete metapath representation. Our evaluation of HetTree on a +variety of real-world datasets demonstrates that it outperforms all existing +baselines on open benchmarks and efficiently scales to large real-world graphs +with millions of nodes and edges. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Disentangling Representations through Multi-task Learning + + +
+ Intelligent perception and interaction with the world hinges on internal +representations that capture its underlying structure (''disentangled'' or +''abstract'' representations). Disentangled representations serve as world +models, isolating latent factors of variation in the world along approximately +orthogonal directions, thus facilitating feature-based generalization. We +provide experimental and theoretical results guaranteeing the emergence of +disentangled representations in agents that optimally solve multi-task evidence +accumulation classification tasks, canonical in the neuroscience literature. +The key conceptual finding is that, by producing accurate multi-task +classification estimates, a system implicitly represents a set of coordinates +specifying a disentangled representation of the underlying latent state of the +data it receives. The theory provides conditions for the emergence of these +representations in terms of noise, number of tasks, and evidence accumulation +time. We experimentally validate these predictions in RNNs trained to +multi-task, which learn disentangled representations in the form of continuous +attractors, leading to zero-shot out-of-distribution (OOD) generalization in +predicting latent factors. We demonstrate the robustness of our framework +across autoregressive architectures, decision boundary geometries and in tasks +requiring classification confidence estimation. We find that transformers are +particularly suited for disentangling representations, which might explain +their unique world understanding abilities. Overall, our framework establishes +a formal link between competence at multiple tasks and the formation of +disentangled, interpretable world models in both biological and artificial +systems, and helps explain why ANNs often arrive at human-interpretable +concepts, and how they both may acquire exceptional zero-shot generalization +capabilities. + +
+
+ comment: 43 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ We Have a Package for You! A Comprehensive Analysis of Package + Hallucinations by Code Generating LLMs USENIX Security + + +
+ The reliance of popular programming languages such as Python and JavaScript +on centralized package repositories and open-source software, combined with the +emergence of code-generating Large Language Models (LLMs), has created a new +type of threat to the software supply chain: package hallucinations. These +hallucinations, which arise from fact-conflicting errors when generating code +using LLMs, represent a novel form of package confusion attack that poses a +critical threat to the integrity of the software supply chain. This paper +conducts a rigorous and comprehensive evaluation of package hallucinations +across different programming languages, settings, and parameters, exploring how +a diverse set of models and configurations affect the likelihood of generating +erroneous package recommendations and identifying the root causes of this +phenomenon. Using 16 popular LLMs for code generation and two unique prompt +datasets, we generate 576,000 code samples in two programming languages that we +analyze for package hallucinations. Our findings reveal that the average +percentage of hallucinated packages is at least 5.2% for commercial models and +21.7% for open-source models, including a staggering 205,474 unique examples of +hallucinated package names, further underscoring the severity and pervasiveness +of this threat. To overcome this problem, we implement several hallucination +mitigation strategies and show that they are able to significantly reduce the +number of package hallucinations while maintaining code quality. Our +experiments and findings highlight package hallucinations as a persistent and +systemic phenomenon while using state-of-the-art LLMs for code generation, and +a significant challenge which deserves the research community's urgent +attention. + +
+
+ comment: To appear in the 2025 USENIX Security Symposium. 22 pages, 14 + figures, 8 tables. Edited from original version for submission to a different + conference. No change to original results or findings +
+
+
+
+
+ + ♻ ☆ Range, not Independence, Drives Modularity in Biologically Inspired + Representations + + +
+ Why do biological and artificial neurons sometimes modularise, each encoding +a single meaningful variable, and sometimes entangle their representation of +many variables? In this work, we develop a theory of when biologically inspired +networks -- those that are nonnegative and energy efficient -- modularise their +representation of source variables (sources). We derive necessary and +sufficient conditions on a sample of sources that determine whether the neurons +in an optimal biologically-inspired linear autoencoder modularise. Our theory +applies to any dataset, extending far beyond the case of statistical +independence studied in previous work. Rather we show that sources modularise +if their support is ``sufficiently spread''. From this theory, we extract and +validate predictions in a variety of empirical studies on how data distribution +affects modularisation in nonlinear feedforward and recurrent neural networks +trained on supervised and unsupervised tasks. Furthermore, we apply these ideas +to neuroscience data, showing that range independence can be used to understand +the mixing or modularising of spatial and reward information in entorhinal +recordings in seemingly conflicting experiments. Further, we use these results +to suggest alternate origins of mixed-selectivity, beyond the predominant +theory of flexible nonlinear classification. In sum, our theory prescribes +precise conditions on when neural activities modularise, providing tools for +inducing and elucidating modular representations in brains and machines. + +
+
+ comment: 47 pages, 17 figures. WD and KH contributed equally; LH and JHL + contributed equally +
+
+
+
+
+ + ♻ ☆ MetaGFN: Exploring Distant Modes with Adapted Metadynamics for + Continuous GFlowNets + + +
+ Generative Flow Networks (GFlowNets) are a class of generative models that +sample objects in proportion to a specified reward function through a learned +policy. They can be trained either on-policy or off-policy, needing a balance +between exploration and exploitation for fast convergence to a target +distribution. While exploration strategies for discrete GFlowNets have been +studied, exploration in the continuous case remains to be investigated, despite +the potential for novel exploration algorithms due to the local connectedness +of continuous domains. Here, we introduce Adapted Metadynamics, a variant of +metadynamics that can be applied to arbitrary black-box reward functions on +continuous domains. We use Adapted Metadynamics as an exploration strategy for +continuous GFlowNets. We show several continuous domains where the resulting +algorithm, MetaGFN, accelerates convergence to the target distribution and +discovers more distant reward modes than previous off-policy exploration +strategies used for GFlowNets. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Linear Diffusion Networks + + +
+ Diffusion kernels capture global dependencies. We present Linear Diffusion +Networks (LDNs), a novel architecture that reinterprets sequential data +processing as a unified diffusion process. Our model integrates adaptive +diffusion modules with localized nonlinear updates and a diffusion-inspired +attention mechanism. This design enables efficient global information +propagation while preserving fine-grained temporal details. LDN overcomes the +limitations of conventional recurrent and transformer models by allowing full +parallelization across time steps and supporting robust multi-scale temporal +representations. Experiments on benchmark sequence modeling tasks demonstrate +that LDN delivers competitive performance across ImageNet and GLUE tasks. + +
+
+
+
+
+ + ♻ ☆ $μ$nit Scaling: Simple and Scalable FP8 LLM Training + + +
+ Large Language Model training with 8-bit floating point (FP8) formats +promises significant efficiency improvements, but reduced numerical precision +makes training challenging. It is currently possible to train in FP8 only if +one is willing to tune various hyperparameters, reduce model scale, or accept +the overhead of computing dynamic scale factors. We demonstrate simple, +scalable FP8 training that requires no dynamic scaling factors or special +hyperparameters, even at large model sizes. Our method, $\mu$nit Scaling +($\mu$S), also enables simple hyperparameter transfer across model widths, +matched numerics across training and inference, and other desirable properties. +$\mu$nit Scaling is straightforward to implement, consisting of a set of +minimal interventions based on a first-principles analysis of common +transformer operations. We validate our method by training models from 1B to +13B parameters, performing all hidden linear layer computations in FP8. We +achieve quality equal to higher precision baselines while also training up to +33% faster. + +
+
+
+
+
+ + ♻ ☆ Lean Copilot: Large Language Models as Copilots for Theorem Proving in + Lean + + +
+ Neural theorem proving combines large language models (LLMs) with proof +assistants such as Lean, where the correctness of formal proofs can be +rigorously verified, leaving no room for hallucination. With existing neural +theorem provers pretrained on a fixed collection of data and offering valuable +suggestions at times, it is challenging for them to continually prove novel +theorems in a fully autonomous mode, where human insights may be critical. In +this paper, we explore LLMs as copilots that assist humans in proving theorems. +We introduce Lean Copilot, a general framework for running LLM inference +natively in Lean. It enables programmers to build various LLM-based proof +automation tools that integrate seamlessly into the workflow of Lean users. +Lean users can use our pretrained models or bring their own ones that run +either locally (with or without GPUs) or on the cloud. Using Lean Copilot, we +build LLM-based tools that suggest proof steps, complete proof goals, and +select relevant premises. Experimental results on the Mathematics in Lean +textbook demonstrate the effectiveness of our method compared to existing +rule-based proof automation in Lean (aesop). When assisting humans, Lean +Copilot requires only 2.08 manually-entered proof steps on average (3.86 +required by aesop); when automating the theorem proving process, Lean Copilot +automates 74.2% proof steps on average, 85% better than aesop (40.1%). We open +source all code and artifacts under a permissive MIT license to facilitate +further research. + +
+
+ comment: All code and artifacts open-sourced at + https://github.com/lean-dojo/LeanCopilot +
+
+
+
+
+ + ♻ ☆ Fast Two-Time-Scale Stochastic Gradient Method with Applications in + Reinforcement Learning + + +
+ Two-time-scale optimization is a framework introduced in Zeng et al. (2024) +that abstracts a range of policy evaluation and policy optimization problems in +reinforcement learning (RL). Akin to bi-level optimization under a particular +type of stochastic oracle, the two-time-scale optimization framework has an +upper level objective whose gradient evaluation depends on the solution of a +lower level problem, which is to find the root of a strongly monotone operator. +In this work, we propose a new method for solving two-time-scale optimization +that achieves significantly faster convergence than the prior arts. The key +idea of our approach is to leverage an averaging step to improve the estimates +of the operators in both lower and upper levels before using them to update the +decision variables. These additional averaging steps eliminate the direct +coupling between the main variables, enabling the accelerated performance of +our algorithm. We characterize the finite-time convergence rates of the +proposed algorithm under various conditions of the underlying objective +function, including strong convexity, Polyak-Lojasiewicz condition, and general +non-convexity. These rates significantly improve over the best-known complexity +of the standard two-time-scale stochastic approximation algorithm. When applied +to RL, we show how the proposed algorithm specializes to novel online +sample-based methods that surpass or match the performance of the existing +state of the art. Finally, we support our theoretical results with numerical +simulations in RL. + +
+
+
+
+
+ + ♻ ☆ LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics ICLR 2025 + + +
+ We introduce LDAdam, a memory-efficient optimizer for training large models, +that performs adaptive optimization steps within lower dimensional subspaces, +while consistently exploring the full parameter space during training. This +strategy keeps the optimizer's memory footprint to a fraction of the model +size. LDAdam relies on a new projection-aware update rule for the optimizer +states that allows for transitioning between subspaces, i.e., estimation of the +statistics of the projected gradients. To mitigate the errors due to low-rank +projection, LDAdam integrates a new generalized error feedback mechanism, which +explicitly accounts for both gradient and optimizer state compression. We prove +the convergence of LDAdam under standard assumptions, and show that LDAdam +allows for accurate and efficient fine-tuning and pre-training of language +models. Code is available at https://github.com/IST-DASLab/LDAdam + +
+
+ comment: 39 pages, ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Fréchet Wavelet Distance: A Domain-Agnostic Metric for Image + Generation + + +
+ Modern metrics for generative learning like Fréchet Inception Distance +(FID) and DINOv2-Fréchet Distance (FD-DINOv2) demonstrate impressive +performance. However, they suffer from various shortcomings, like a bias +towards specific generators and datasets. To address this problem, we propose +the Fréchet Wavelet Distance (FWD) as a domain-agnostic metric based on the +Wavelet Packet Transform ($W_p$). FWD provides a sight across a broad spectrum +of frequencies in images with a high resolution, preserving both spatial and +textural aspects. Specifically, we use $W_p$ to project generated and real +images to the packet coefficient space. We then compute the Fréchet distance +with the resultant coefficients to evaluate the quality of a generator. This +metric is general-purpose and dataset-domain agnostic, as it does not rely on +any pre-trained network, while being more interpretable due to its ability to +compute Fréchet distance per packet, enhancing transparency. We conclude with +an extensive evaluation of a wide variety of generators across various datasets +that the proposed FWD can generalize and improve robustness to domain shifts +and various corruptions compared to other metrics. + +
+
+
+
+
+ + ♻ ☆ Kolmogorov-Arnold PointNet: Deep learning for prediction of fluid fields + on irregular geometries + + +
+ Kolmogorov-Arnold Networks (KANs) have emerged as a promising alternative to +traditional Multilayer Perceptrons (MLPs) in deep learning. KANs have already +been integrated into various architectures, such as convolutional neural +networks, graph neural networks, and transformers, and their potential has been +assessed for predicting physical quantities. However, the combination of KANs +with point-cloud-based neural networks (e.g., PointNet) for computational +physics has not yet been explored. To address this, we present +Kolmogorov-Arnold PointNet (KA-PointNet) as a novel supervised deep learning +framework for the prediction of incompressible steady-state fluid flow fields +in irregular domains, where the predicted fields are a function of the geometry +of the domains. In KA-PointNet, we implement shared KANs in the segmentation +branch of the PointNet architecture. We utilize Jacobi polynomials to construct +shared KANs. As a benchmark test case, we consider incompressible laminar +steady-state flow over a cylinder, where the geometry of its cross-section +varies over the data set. We investigate the performance of Jacobi polynomials +with different degrees as well as special cases of Jacobi polynomials such as +Legendre polynomials, Chebyshev polynomials of the first and second kinds, and +Gegenbauer polynomials, in terms of the computational cost of training and +accuracy of prediction of the test set. Additionally, we compare the +performance of PointNet with shared KANs (i.e., KA-PointNet) and PointNet with +shared MLPs. It is observed that when the number of trainable parameters is +approximately equal, PointNet with shared KANs (i.e., KA-PointNet) outperforms +PointNet with shared MLPs. Moreover, KA-PointNet predicts the pressure and +velocity distributions along the surface of cylinders more accurately, +resulting in more precise computations of lift and drag. + +
+
+
+
+
+ + ♻ ☆ Distributed Speculative Inference (DSI): Speculation Parallelism for + Provably Faster Lossless Language Model Inference ICLR 2025 + + +
+ This paper introduces distributed speculative inference (DSI), a novel +inference algorithm that is provably faster than speculative inference (SI) +[leviathan2023, chen2023, miao2024, sun2025, timor2025] and standard +autoregressive inference (non-SI). Like other SI algorithms, DSI operates on +frozen language models (LMs), requiring no training or architectural +modifications, and it preserves the target distribution. Prior studies on SI +have demonstrated empirical speedups over non-SI--but rely on sufficiently fast +and accurate drafters, which are often unavailable in practice. We identify a +gap where SI can be slower than non-SI if drafters are too slow or inaccurate. +We close this gap by proving that DSI is faster than both SI and non-SI--given +any drafters. DSI is therefore not only faster than SI, but also unlocks the +acceleration of LMs for which SI fails. DSI leverages speculation parallelism +(SP), a novel type of task parallelism, to orchestrate target and drafter +instances that overlap in time, establishing a new foundational tradeoff +between computational resources and latency. Our simulations show that DSI is +1.29-1.92x faster than SI in single-node setups for various off-the-shelf LMs +and tasks. We open-source all our code. + +
+
+ comment: Published at ICLR 2025. (Link: + https://openreview.net/forum?id=cJd1BgZ9CS) +
+
+
+
+
+ + ♻ ☆ TESGNN: Temporal Equivariant Scene Graph Neural Networks for Efficient + and Robust Multi-View 3D Scene Understanding + + +
+ Scene graphs have proven to be highly effective for various scene +understanding tasks due to their compact and explicit representation of +relational information. However, current methods often overlook the critical +importance of preserving symmetry when generating scene graphs from 3D point +clouds, which can lead to reduced accuracy and robustness, particularly when +dealing with noisy, multi-view data. Furthermore, a major limitation of prior +approaches is the lack of temporal modeling to capture time-dependent +relationships among dynamically evolving entities in a scene. To address these +challenges, we propose Temporal Equivariant Scene Graph Neural Network +(TESGNN), consisting of two key components: (1) an Equivariant Scene Graph +Neural Network (ESGNN), which extracts information from 3D point clouds to +generate scene graphs while preserving crucial symmetry properties, and (2) a +Temporal Graph Matching Network, which fuses scene graphs generated by ESGNN +across multiple time sequences into a unified global representation using an +approximate graph-matching algorithm. Our combined architecture TESGNN +outperforms current state-of-the-art methods in scene graph generation, +achieving higher accuracy and faster training convergence. Moreover, we show +that leveraging the symmetry-preserving property produces a more stable and +accurate global scene representation compared to existing approaches. Last but +not least, it is computationally efficient and easily implementable using +existing frameworks, making it well-suited for real-time applications in +robotics and computer vision. This approach paves the way for more robust and +scalable solutions to complex multi-view scene understanding challenges. Our +source code is publicly available at: https://github.com/HySonLab/TESGraph + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.00609 +
+
+
+
+
+ + ♻ ☆ Unmasking Social Bots: How Confident Are We? + + +
+ Social bots remain a major vector for spreading disinformation on social +media and a menace to the public. Despite the progress made in developing +multiple sophisticated social bot detection algorithms and tools, bot detection +remains a challenging, unsolved problem that is fraught with uncertainty due to +the heterogeneity of bot behaviors, training data, and detection algorithms. +Detection models often disagree on whether to label the same account as bot or +human-controlled. However, they do not provide any measure of uncertainty to +indicate how much we should trust their results. We propose to address both bot +detection and the quantification of uncertainty at the account level - a novel +feature of this research. This dual focus is crucial as it allows us to +leverage additional information related to the quantified uncertainty of each +prediction, thereby enhancing decision-making and improving the reliability of +bot classifications. Specifically, our approach facilitates targeted +interventions for bots when predictions are made with high confidence and +suggests caution (e.g., gathering more data) when predictions are uncertain. + +
+
+ comment: 15 pages, 6 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ The Optimization Landscape of SGD Across the Feature Learning Strength ICLR 2025 + + +
+ We consider neural networks (NNs) where the final layer is down-scaled by a +fixed hyperparameter $\gamma$. Recent work has identified $\gamma$ as +controlling the strength of feature learning. As $\gamma$ increases, network +evolution changes from "lazy" kernel dynamics to "rich" feature-learning +dynamics, with a host of associated benefits including improved performance on +common tasks. In this work, we conduct a thorough empirical investigation of +the effect of scaling $\gamma$ across a variety of models and datasets in the +online training setting. We first examine the interaction of $\gamma$ with the +learning rate $\eta$, identifying several scaling regimes in the +$\gamma$-$\eta$ plane which we explain theoretically using a simple model. We +find that the optimal learning rate $\eta^*$ scales non-trivially with +$\gamma$. In particular, $\eta^* \propto \gamma^2$ when $\gamma \ll 1$ and +$\eta^* \propto \gamma^{2/L}$ when $\gamma \gg 1$ for a feed-forward network of +depth $L$. Using this optimal learning rate scaling, we proceed with an +empirical study of the under-explored "ultra-rich" $\gamma \gg 1$ regime. We +find that networks in this regime display characteristic loss curves, starting +with a long plateau followed by a drop-off, sometimes followed by one or more +additional staircase steps. We find networks of different large $\gamma$ values +optimize along similar trajectories up to a reparameterization of time. We +further find that optimal online performance is often found at large $\gamma$ +and could be missed if this hyperparameter is not tuned. Our findings indicate +that analytical study of the large-$\gamma$ limit may yield useful insights +into the dynamics of representation learning in performant models. + +
+
+ comment: ICLR 2025 Final Copy, 40 Pages, 45 figures +
+
+
+
+
+ + ♻ ☆ Detecting Unsuccessful Students in Cybersecurity Exercises in Two + Different Learning Environments + + +
+ This full paper in the research track evaluates the usage of data logged from +cybersecurity exercises in order to predict students who are potentially at +risk of performing poorly. Hands-on exercises are essential for learning since +they enable students to practice their skills. In cybersecurity, hands-on +exercises are often complex and require knowledge of many topics. Therefore, +students may miss solutions due to gaps in their knowledge and become +frustrated, which impedes their learning. Targeted aid by the instructor helps, +but since the instructor's time is limited, efficient ways to detect struggling +students are needed. This paper develops automated tools to predict when a +student is having difficulty. We formed a dataset with the actions of 313 +students from two countries and two learning environments: KYPO CRP and +EDURange. These data are used in machine learning algorithms to predict the +success of students in exercises deployed in these environments. After +extracting features from the data, we trained and cross-validated eight +classifiers for predicting the exercise outcome and evaluated their predictive +power. The contribution of this paper is comparing two approaches to feature +engineering, modeling, and classification performance on data from two learning +environments. Using the features from either learning environment, we were able +to detect and distinguish between successful and struggling students. A +decision tree classifier achieved the highest balanced accuracy and sensitivity +with data from both learning environments. The results show that activity data +from cybersecurity exercises are suitable for predicting student success. In a +potential application, such models can aid instructors in detecting struggling +students and providing targeted help. We publish data and code for building +these models so that others can adopt or adapt them. + +
+
+ comment: Published in the FIE 2024 conference proceedings, see + https://doi.org/10.1109/FIE61694.2024.10893135 +
+
+
+
+
+ + ♻ ☆ Differentiable Weightless Neural Networks + + +
+ We introduce the Differentiable Weightless Neural Network (DWN), a model +based on interconnected lookup tables. Training of DWNs is enabled by a novel +Extended Finite Difference technique for approximate differentiation of binary +values. We propose Learnable Mapping, Learnable Reduction, and Spectral +Regularization to further improve the accuracy and efficiency of these models. +We evaluate DWNs in three edge computing contexts: (1) an FPGA-based hardware +accelerator, where they demonstrate superior latency, throughput, energy +efficiency, and model area compared to state-of-the-art solutions, (2) a +low-power microcontroller, where they achieve preferable accuracy to XGBoost +while subject to stringent memory constraints, and (3) ultra-low-cost chips, +where they consistently outperform small models in both accuracy and projected +hardware area. DWNs also compare favorably against leading approaches for +tabular datasets, with higher average rank. Overall, our work positions DWNs as +a pioneering solution for edge-compatible high-throughput neural networks. + +
+
+
+
+
+ + ♻ ☆ Learning General-Purpose Biomedical Volume Representations using + Randomized Synthesis ICLR 2025 + + +
+ Current volumetric biomedical foundation models struggle to generalize as +public 3D datasets are small and do not cover the broad diversity of medical +procedures, conditions, anatomical regions, and imaging protocols. We address +this by creating a representation learning method that instead anticipates +strong domain shifts at training time itself. We first propose a data engine +that synthesizes highly variable training samples that would enable +generalization to new biomedical contexts. To then train a single 3D network +for any voxel-level task, we develop a contrastive learning method that +pretrains the network to be stable against nuisance imaging variation simulated +by the data engine, a key inductive bias for generalization. This network's +features can be used as robust representations of input images for downstream +tasks and its weights provide a strong, dataset-agnostic initialization for +finetuning on new datasets. As a result, we set new standards across both +multimodality registration and few-shot segmentation, a first for any 3D +biomedical vision model, all without (pre-)training on any existing dataset of +real images. + +
+
+ comment: ICLR 2025: International Conference on Learning Representations. Code + and model weights available at https://github.com/neel-dey/anatomix. + Keywords: synthetic data, representation learning, medical image analysis, + image registration, image segmentation +
+
+
+
+
+ + ♻ ☆ Prompting Fairness: Integrating Causality to Debias Large Language + Models + + +
+ Large language models (LLMs), despite their remarkable capabilities, are +susceptible to generating biased and discriminatory responses. As LLMs +increasingly influence high-stakes decision-making (e.g., hiring and +healthcare), mitigating these biases becomes critical. In this work, we propose +a causality-guided debiasing framework to tackle social biases, aiming to +reduce the objectionable dependence between LLMs' decisions and the social +information in the input. Our framework introduces a novel perspective to +identify how social information can affect an LLM's decision through different +causal pathways. Leveraging these causal insights, we outline principled +prompting strategies that regulate these pathways through selection mechanisms. +This framework not only unifies existing prompting-based debiasing techniques, +but also opens up new directions for reducing bias by encouraging the model to +prioritize fact-based reasoning over reliance on biased social cues. We +validate our framework through extensive experiments on real-world datasets +across multiple domains, demonstrating its effectiveness in debiasing LLM +decisions, even with only black-box access to the model. + +
+
+ comment: 24 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Explain Yourself, Briefly! Self-Explaining Neural Networks with Concise + Sufficient Reasons ICLR 2025 + + +
+ *Minimal sufficient reasons* represent a prevalent form of explanation - the +smallest subset of input features which, when held constant at their +corresponding values, ensure that the prediction remains unchanged. Previous +*post-hoc* methods attempt to obtain such explanations but face two main +limitations: (1) Obtaining these subsets poses a computational challenge, +leading most scalable methods to converge towards suboptimal, less meaningful +subsets; (2) These methods heavily rely on sampling out-of-distribution input +assignments, potentially resulting in counterintuitive behaviors. To tackle +these limitations, we propose in this work a self-supervised training approach, +which we term *sufficient subset training* (SST). Using SST, we train models to +generate concise sufficient reasons for their predictions as an integral part +of their output. Our results indicate that our framework produces succinct and +faithful subsets substantially more efficiently than competing post-hoc +methods, while maintaining comparable predictive performance. + +
+
+ comment: To appear in ICLR 2025 +
+
+
+
+
+ + ♻ ☆ SymDiff: Equivariant Diffusion via Stochastic Symmetrisation ICLR 2025 + + +
+ We propose SymDiff, a method for constructing equivariant diffusion models +using the framework of stochastic symmetrisation. SymDiff resembles a learned +data augmentation that is deployed at sampling time, and is lightweight, +computationally efficient, and easy to implement on top of arbitrary +off-the-shelf models. In contrast to previous work, SymDiff typically does not +require any neural network components that are intrinsically equivariant, +avoiding the need for complex parameterisations or the use of higher-order +geometric features. Instead, our method can leverage highly scalable modern +architectures as drop-in replacements for these more constrained alternatives. +We show that this additional flexibility yields significant empirical benefit +for $\mathrm{E}(3)$-equivariant molecular generation. To the best of our +knowledge, this is the first application of symmetrisation to generative +modelling, suggesting its potential in this domain more generally. + +
+
+ comment: Camera-ready version for ICLR 2025 +
+
+
+
+
+ + ♻ ☆ FedBiP: Heterogeneous One-Shot Federated Learning with Personalized + Latent Diffusion Models CVPR 2025 + + +
+ One-Shot Federated Learning (OSFL), a special decentralized machine learning +paradigm, has recently gained significant attention. OSFL requires only a +single round of client data or model upload, which reduces communication costs +and mitigates privacy threats compared to traditional FL. Despite these +promising prospects, existing methods face challenges due to client data +heterogeneity and limited data quantity when applied to real-world OSFL +systems. Recently, Latent Diffusion Models (LDM) have shown remarkable +advancements in synthesizing high-quality images through pretraining on +large-scale datasets, thereby presenting a potential solution to overcome these +issues. However, directly applying pretrained LDM to heterogeneous OSFL results +in significant distribution shifts in synthetic data, leading to performance +degradation in classification models trained on such data. This issue is +particularly pronounced in rare domains, such as medical imaging, which are +underrepresented in LDM's pretraining data. To address this challenge, we +propose Federated Bi-Level Personalization (FedBiP), which personalizes the +pretrained LDM at both instance-level and concept-level. Hereby, FedBiP +synthesizes images following the client's local data distribution without +compromising the privacy regulations. FedBiP is also the first approach to +simultaneously address feature space heterogeneity and client data scarcity in +OSFL. Our method is validated through extensive experiments on three OSFL +benchmarks with feature space heterogeneity, as well as on challenging medical +and satellite image datasets with label heterogeneity. The results demonstrate +the effectiveness of FedBiP, which substantially outperforms other OSFL +methods. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Towards Understanding the Universality of Transformers for Next-Token + Prediction ICLR 2025 + + +
+ Causal Transformers are trained to predict the next token for a given +context. While it is widely accepted that self-attention is crucial for +encoding the causal structure of sequences, the precise underlying mechanism +behind this in-context autoregressive learning ability remains unclear. In this +paper, we take a step towards understanding this phenomenon by studying the +approximation ability of Transformers for next-token prediction. Specifically, +we explore the capacity of causal Transformers to predict the next token +$x_{t+1}$ given an autoregressive sequence $(x_1, \dots, x_t)$ as a prompt, +where $ x_{t+1} = f(x_t) $, and $ f $ is a context-dependent function that +varies with each sequence. On the theoretical side, we focus on specific +instances, namely when $ f $ is linear or when $ (x_t)_{t \geq 1} $ is +periodic. We explicitly construct a Transformer (with linear, exponential, or +softmax attention) that learns the mapping $f$ in-context through a causal +kernel descent method. The causal kernel descent method we propose provably +estimates $x_{t+1} $ based solely on past and current observations $ (x_1, +\dots, x_t) $, with connections to the Kaczmarz algorithm in Hilbert spaces. We +present experimental results that validate our theoretical findings and suggest +their applicability to more general mappings $f$. + +
+
+ comment: ICLR 2025, 20 pages +
+
+
+
+
+ + ♻ ☆ Leveraging Dual Process Theory in Language Agent Framework for Real-time + Simultaneous Human-AI Collaboration + + +
+ Agents built on large language models (LLMs) have excelled in turn-by-turn +human-AI collaboration but struggle with simultaneous tasks requiring real-time +interaction. Latency issues and the challenge of inferring variable human +strategies hinder their ability to make autonomous decisions without explicit +instructions. Through experiments with current independent System 1 and System +2 methods, we validate the necessity of using Dual Process Theory (DPT) in +real-time tasks. We propose DPT-Agent, a novel language agent framework that +integrates System 1 and System 2 for efficient real-time simultaneous human-AI +collaboration. DPT-Agent's System 1 uses a Finite-state Machine (FSM) and +code-as-policy for fast, intuitive, and controllable decision-making. +DPT-Agent's System 2 integrates Theory of Mind (ToM) and asynchronous +reflection to infer human intentions and perform reasoning-based autonomous +decisions. We demonstrate the effectiveness of DPT-Agent through further +experiments with rule-based agents and human collaborators, showing significant +improvements over mainstream LLM-based frameworks. DPT-Agent can effectively +help LLMs convert correct slow thinking and reasoning into executable actions, +thereby improving performance. To the best of our knowledge, DPT-Agent is the +first language agent framework that achieves successful real-time simultaneous +human-AI collaboration autonomously. Code of DPT-Agent can be found in +https://github.com/sjtu-marl/DPT-Agent. + +
+
+ comment: Preprint under review. Update the experimental results of the + DeepSeek-R1 series models, o3-mini-high and o3-mini-medium +
+
+
+
+
+ + ♻ ☆ Bidirectional Consistency Models ICML 2024 + + +
+ Diffusion models (DMs) are capable of generating remarkably high-quality +samples by iteratively denoising a random vector, a process that corresponds to +moving along the probability flow ordinary differential equation (PF ODE). +Interestingly, DMs can also invert an input image to noise by moving backward +along the PF ODE, a key operation for downstream tasks such as interpolation +and image editing. However, the iterative nature of this process restricts its +speed, hindering its broader application. Recently, Consistency Models (CMs) +have emerged to address this challenge by approximating the integral of the PF +ODE, largely reducing the number of iterations. Yet, the absence of an explicit +ODE solver complicates the inversion process. To resolve this, we introduce +Bidirectional Consistency Model (BCM), which learns a single neural network +that enables both forward and backward traversal along the PF ODE, efficiently +unifying generation and inversion tasks within one framework. We can train BCM +from scratch or tune it using a pretrained consistency model, which reduces the +training cost and increases scalability. We demonstrate that BCM enables +one-step generation and inversion while also allowing the use of additional +steps to enhance generation quality or reduce reconstruction error. We further +showcase BCM's capability in downstream tasks, such as interpolation and +inpainting. Our code and weights are available at +https://github.com/Mosasaur5526/BCM-iCT-torch. + +
+
+ comment: 39 pages, 27 figures; a shorter version of this paper was accepted + at the ICML 2024 Workshop on Structured Probabilistic Inference & Generative + Modeling +
+
+
+
+
+ + ♻ ☆ MoE-CAP: Benchmarking Cost, Accuracy and Performance of Sparse + Mixture-of-Experts Systems + + +
+ The Mixture-of-Experts (MoE) architecture is increasingly favored for scaling +Large Language Models (LLMs). Its key feature, sparse activation, selectively +activates only a subset of parameters (experts) per token, reducing memory +bandwidth and compute FLOPs compared to dense models. To capitalize on this, +MoE designers leverage heterogeneous compute and memory hardware to lower +system costs. However, the interaction between model sparsity and hardware +heterogeneity introduces trade-offs in Cost, Accuracy, and Performance (CAP). +To address this, we introduce MoE-CAP, a benchmarking method for evaluating +sparse MoE systems across these three dimensions. Its key innovation is a +sparsity-aware CAP analysis model, the first to integrate cost, performance, +and accuracy metrics into a single diagram while estimating the impact of +sparsity on system performance. MoE-CAP helps practitioners optimize hardware +provisioning for an MoE model, or vice versa. MoE-CAP supports various MoE +models and provides more accurate metrics than existing methods. + +
+
+
+
+
+ + ♻ ☆ Foundational Policy Acquisition via Multitask Learning for Motor Skill + Generation + + +
+ In this study, we propose a multitask reinforcement learning algorithm for +foundational policy acquisition to generate novel motor skills. +Learning the rich representation of the multitask policy is +a challenge in dynamic movement generation tasks because the policy needs to +cope with changes in goals or environments with different reward functions or +physical parameters. Inspired by human sensorimotor adaptation mechanisms, we +developed the learning pipeline to construct the encoder-decoder networks and +network selection to facilitate foundational policy acquisition under multiple +situations. First, we compared the proposed method with previous multitask +reinforcement learning methods in the standard multi-locomotion tasks. The +results showed that the proposed approach outperformed the baseline methods. +Then, we applied the proposed method to the ball heading task using a monopod +robot model to evaluate skill generation performance. The results showed that +the proposed method was able not only to adapt to novel target positions or +inexperienced ball restitution coefficients but also to acquire a foundational +policy network, originally learned for heading motion, which can generate an +entirely new overhead kicking skill. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Phase-Amplitude Reduction-Based Imitation Learning + + +
+ In this study, we propose the use of the phase-amplitude reduction method to +construct an imitation learning framework. Imitating human movement +trajectories is recognized as a promising strategy for generating a range of +human-like robot movements. Unlike previous dynamical system-based imitation +learning approaches, our proposed method allows the robot not only to imitate a +limit cycle trajectory but also to replicate the transient movement from the +initial or disturbed state to the limit cycle. Consequently, our method offers +a safer imitation learning approach that avoids generating unpredictable +motions immediately after disturbances or from a specified initial state. We +first validated our proposed method by reconstructing a simple limit-cycle +attractor. We then compared the proposed approach with a conventional method on +a lemniscate trajectory tracking task with a simulated robot arm. Our findings +confirm that our proposed method can more accurately generate transient +movements to converge on a target periodic attractor compared to the previous +standard approach. Subsequently, we applied our method to a real robot arm to +imitate periodic human movements. + +
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Nonparametric Heterogeneous Long-term Causal Effect Estimation via Data + Combination + + +
+ Long-term causal inference has drawn increasing attention in many scientific +domains. Existing methods mainly focus on estimating average long-term causal +effects by combining long-term observational data and short-term experimental +data. However, it is still understudied how to robustly and effectively +estimate heterogeneous long-term causal effects, significantly limiting +practical applications. In this paper, we propose several two-stage style +nonparametric estimators for heterogeneous long-term causal effect estimation, +including propensity-based, regression-based, and multiple robust estimators. +We conduct a comprehensive theoretical analysis of their asymptotic properties +under mild assumptions, with the ultimate goal of building a better +understanding of the conditions under which some estimators can be expected to +perform better. Extensive experiments across several semi-synthetic and +real-world datasets validate the theoretical results and demonstrate the +effectiveness of the proposed estimators. + +
+
+
+
+
+ + ♻ ☆ TradingAgents: Multi-Agents LLM Financial Trading Framework AAAI 2025 + + +
+ Significant progress has been made in automated problem-solving using +societies of agents powered by large language models (LLMs). In finance, +efforts have largely focused on single-agent systems handling specific tasks or +multi-agent frameworks independently gathering data. However, multi-agent +systems' potential to replicate real-world trading firms' collaborative +dynamics remains underexplored. TradingAgents proposes a novel stock trading +framework inspired by trading firms, featuring LLM-powered agents in +specialized roles such as fundamental analysts, sentiment analysts, technical +analysts, and traders with varied risk profiles. The framework includes Bull +and Bear researcher agents assessing market conditions, a risk management team +monitoring exposure, and traders synthesizing insights from debates and +historical data to make informed decisions. By simulating a dynamic, +collaborative trading environment, this framework aims to improve trading +performance. Detailed architecture and extensive experiments reveal its +superiority over baseline models, with notable improvements in cumulative +returns, Sharpe ratio, and maximum drawdown, highlighting the potential of +multi-agent LLM frameworks in financial trading. TradingAgents is available at +https://github.com/PioneerFintech. + +
+
+ comment: Multi-Agent AI in the Real World @ AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event + Condition For Foley Sound + + +
+ Foley sound synthesis is crucial for multimedia production, enhancing user +experience by synchronizing audio and video both temporally and semantically. +Recent studies on automating this labor-intensive process through +video-to-sound generation face significant challenges. Systems lacking explicit +temporal features suffer from poor alignment and controllability, while +timestamp-based models require costly and subjective human annotation. We +propose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as an +intuitive condition with semantic timbre prompts (audio or text). RMS, a +frame-level intensity envelope closely related to audio semantics, acts as a +temporal event feature to guide audio generation from video. The +annotation-free self-supervised learning framework consists of two stages, +Video2RMS and RMS2Sound, incorporating novel ideas including RMS discretization +and RMS-ControlNet with a pretrained text-to-audio model. Our extensive +evaluation shows that Video-Foley achieves state-of-the-art performance in +audio-visual alignment and controllability for sound timing, intensity, timbre, +and nuance. Source code, model weights and demos are available on our companion +website. (https://jnwnlee.github.io/video-foley-demo) + +
+
+
+
+
+ + ♻ ☆ Audio-Visual Instance Segmentation CVPR 2025 + + +
+ In this paper, we propose a new multi-modal task, termed audio-visual +instance segmentation (AVIS), which aims to simultaneously identify, segment +and track individual sounding object instances in audible videos. To facilitate +this research, we introduce a high-quality benchmark named AVISeg, containing +over 90K instance masks from 26 semantic categories in 926 long videos. +Additionally, we propose a strong baseline model for this task. Our model first +localizes sound source within each frame, and condenses object-specific +contexts into concise tokens. Then it builds long-range audio-visual +dependencies between these tokens using window-based attention, and tracks +sounding objects among the entire video sequences. Extensive experiments reveal +that our method performs best on AVISeg, surpassing the existing methods from +related tasks. We further conduct the evaluation on several multi-modal large +models. Unfortunately, they exhibit subpar performance on instance-level sound +source localization and temporal perception. We expect that AVIS will inspire +the community towards a more comprehensive multi-modal understanding. Dataset +and code are available at https://github.com/ruohaoguo/avis. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ GAMED: Knowledge Adaptive Multi-Experts Decoupling for Multimodal Fake + News Detection + + +
+ Multimodal fake news detection often involves modelling heterogeneous data +sources, such as vision and language. Existing detection methods typically rely +on fusion effectiveness and cross-modal consistency to model the content, +complicating understanding how each modality affects prediction accuracy. +Additionally, these methods are primarily based on static feature modelling, +making it difficult to adapt to the dynamic changes and relationships between +different data modalities. This paper develops a significantly novel approach, +GAMED, for multimodal modelling, which focuses on generating distinctive and +discriminative features through modal decoupling to enhance cross-modal +synergies, thereby optimizing overall performance in the detection process. +GAMED leverages multiple parallel expert networks to refine features and +pre-embed semantic knowledge to improve the experts' ability in information +selection and viewpoint sharing. Subsequently, the feature distribution of each +modality is adaptively adjusted based on the respective experts' opinions. +GAMED also introduces a novel classification technique to dynamically manage +contributions from different modalities, while improving the explainability of +decisions. Experimental results on the Fakeddit and Yang datasets demonstrate +that GAMED performs better than recently developed state-of-the-art models. The +source code can be accessed at https://github.com/slz0925/GAMED. + +
+
+
+
+
+ + ♻ ☆ Improving LSH via Tensorized Random Projection + + +
+ Locality sensitive hashing (LSH) is a fundamental algorithmic toolkit used by +data scientists for approximate nearest neighbour search problems that have +been used extensively in many large scale data processing applications such as +near duplicate detection, nearest neighbour search, clustering, etc. In this +work, we aim to propose faster and space efficient locality sensitive hash +functions for Euclidean distance and cosine similarity for tensor data. +Typically, the naive approach for obtaining LSH for tensor data involves first +reshaping the tensor into vectors, followed by applying existing LSH methods +for vector data $E2LSH$ and $SRP$. However, this approach becomes impractical +for higher order tensors because the size of the reshaped vector becomes +exponential in the order of the tensor. Consequently, the size of LSH +parameters increases exponentially. To address this problem, we suggest two +methods for LSH for Euclidean distance and cosine similarity, namely +$CP-E2LSH$, $TT-E2LSH$, and $CP-SRP$, $TT-SRP$, respectively, building on $CP$ +and tensor train $(TT)$ decomposition techniques. Our approaches are space +efficient and can be efficiently applied to low rank $CP$ or $TT$ tensors. We +provide a rigorous theoretical analysis of our proposal on their correctness +and efficacy. + +
+
+
+
+
+ + ♻ ☆ Where is the Testbed for my Federated Learning Research? + + +
+ Progressing beyond centralized AI is of paramount importance, yet, +distributed AI solutions, in particular various federated learning (FL) +algorithms, are often not comprehensively assessed, which prevents the research +community from identifying the most promising approaches and practitioners from +being convinced that a certain solution is deployment-ready. The largest hurdle +towards FL algorithm evaluation is the difficulty of conducting real-world +experiments over a variety of FL client devices and different platforms, with +different datasets and data distribution, all while assessing various +dimensions of algorithm performance, such as inference accuracy, energy +consumption, and time to convergence, to name a few. In this paper, we present +CoLExT, a real-world testbed for FL research. CoLExT is designed to streamline +experimentation with custom FL algorithms in a rich testbed configuration +space, with a large number of heterogeneous edge devices, ranging from +single-board computers to smartphones, and provides real-time collection and +visualization of a variety of metrics through automatic instrumentation. +According to our evaluation, porting FL algorithms to CoLExT requires minimal +involvement from the developer, and the instrumentation introduces minimal +resource usage overhead. Furthermore, through an initial investigation +involving popular FL algorithms running on CoLExT, we reveal previously unknown +trade-offs, inefficiencies, and programming bugs. + +
+
+ comment: SEC 2024 +
+
+
+
+
+ + ♻ ☆ When Attention Sink Emerges in Language Models: An Empirical View ICLR 2025 + + +
+ Language Models (LMs) assign significant attention to the first token, even +if it is not semantically important, which is known as attention sink. This +phenomenon has been widely adopted in applications such as streaming/long +context generation, KV cache optimization, inference acceleration, model +quantization, and others. Despite its widespread use, a deep understanding of +attention sink in LMs is still lacking. In this work, we first demonstrate that +attention sinks exist universally in LMs with various inputs, even in small +models. Furthermore, attention sink is observed to emerge during the LM +pre-training, motivating us to investigate how optimization, data distribution, +loss function, and model architecture in LM pre-training influence its +emergence. We highlight that attention sink emerges after effective +optimization on sufficient training data. The sink position is highly +correlated with the loss function and data distribution. Most importantly, we +find that attention sink acts more like key biases, storing extra attention +scores, which could be non-informative and not contribute to the value +computation. We also observe that this phenomenon (at least partially) stems +from tokens' inner dependence on attention scores as a result of softmax +normalization. After relaxing such dependence by replacing softmax attention +with other attention operations, such as sigmoid attention without +normalization, attention sinks do not emerge in LMs up to 1B parameters. The +code is available at https://github.com/sail-sg/Attention-Sink. + +
+
+ comment: ICLR 2025 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Cheating Automatic LLM Benchmarks: Null Models Achieve High Win Rates ICLR 2025 + + +
+ Automatic LLM benchmarks, such as AlpacaEval 2.0, Arena-Hard-Auto, and +MT-Bench, have become popular for evaluating language models due to their +cost-effectiveness and scalability compared to human evaluation. Achieving high +win rates on these benchmarks can significantly boost the promotional impact of +newly released language models. This promotional benefit may motivate tricks, +such as manipulating model output length or style to game win rates, even +though several mechanisms have been developed to control length and disentangle +style to reduce gameability. Nonetheless, we show that even a "null model" that +always outputs a constant response (irrelevant to input instructions) can cheat +automatic benchmarks and achieve top-ranked win rates: an 86.5% LC win rate on +AlpacaEval 2.0; an 83.0 score on Arena-Hard-Auto; and a 9.55 score on MT-Bench. +Moreover, the crafted cheating outputs are transferable because we assume that +the instructions of these benchmarks (e.g., 805 samples of AlpacaEval 2.0) are +private and cannot be accessed. While our experiments are primarily +proof-of-concept, an adversary could use LLMs to generate more imperceptible +cheating responses, unethically benefiting from high win rates and promotional +impact. Our findings call for the development of anti-cheating mechanisms for +reliable automatic benchmarks. The code is available at +https://github.com/sail-sg/Cheating-LLM-Benchmarks. + +
+
+ comment: ICLR 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ Graph Transformers Dream of Electric Flow + + +
+ We show theoretically and empirically that the linear Transformer, when +applied to graph data, can implement algorithms that solve canonical problems +such as electric flow and eigenvector decomposition. The Transformer has access +to information on the input graph only via the graph's incidence matrix. We +present explicit weight configurations for implementing each algorithm, and we +bound the constructed Transformers' errors by the errors of the underlying +algorithms. Our theoretical findings are corroborated by experiments on +synthetic data. Additionally, on a real-world molecular regression task, we +observe that the linear Transformer is capable of learning a more effective +positional encoding than the default one based on Laplacian eigenvectors. Our +work is an initial step towards elucidating the inner-workings of the +Transformer for graph data. Code is available at +https://github.com/chengxiang/LinearGraphTransformer + +
+
+
+
+
+ + ♻ ☆ Accelerating 3D Molecule Generation via Jointly Geometric Optimal + Transport ICLR 2025 + + +
+ This paper proposes a new 3D molecule generation framework, called GOAT, for +fast and effective 3D molecule generation based on the flow-matching optimal +transport objective. Specifically, we formulate a geometric transport formula +for measuring the cost of mapping multi-modal features (e.g., continuous atom +coordinates and categorical atom types) between a base distribution and a +target data distribution. Our formula is solved within a joint, equivariant, +and smooth representation space. This is achieved by transforming the +multi-modal features into a continuous latent space with equivariant networks. +In addition, we find that identifying optimal distributional coupling is +necessary for fast and effective transport between any two distributions. We +further propose a mechanism for estimating and purifying optimal coupling to +train the flow model with optimal transport. By doing so, GOAT can turn +arbitrary distribution couplings into new deterministic couplings, leading to +an estimated optimal transport plan for fast 3D molecule generation. The +purification filters out the subpar molecules to ensure the ultimate generation +quality. We theoretically and empirically prove that the proposed optimal +coupling estimation and purification yield a transport plan with non-increasing +cost. Finally, extensive experiments show that GOAT enjoys the efficiency of +solving geometric optimal transport, leading to a double speedup compared to +the sub-optimal method while achieving the best generation quality regarding +validity, uniqueness, and novelty. The code is available at +https://github.com/WanyuGroup/ICLR2025-GOAT. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Knowledge Gradient for Multi-Objective Bayesian Optimization with + Decoupled Evaluations + + +
+ Multi-objective Bayesian optimization aims to find the Pareto front of +trade-offs between a set of expensive objectives while collecting as few +samples as possible. In some cases, it is possible to evaluate the objectives +separately, and a different latency or evaluation cost can be associated with +each objective. This decoupling of the objectives presents an opportunity to +learn the Pareto front faster by avoiding unnecessary, expensive evaluations. +We propose a scalarization based knowledge gradient acquisition function which +accounts for the different evaluation costs of the objectives. We prove +asymptotic consistency of the estimator of the optimum for an arbitrary, +D-dimensional, real compact search space and show empirically that the +algorithm performs comparably with the state of the art and significantly +outperforms versions which always evaluate both objectives. + +
+
+ comment: 36 pages. This preprint has not undergone peer review (when + applicable) or any post-submission improvements or corrections. The Version + of Record of this contribution is published in 'Evolutionary Multi-Criterion + Optimization', LNCS 15513, and is available online at + https://doi.org/10.1007/978-981-96-3538-2_9 +
+
+
+
+
+ + ♻ ☆ UMGAD: Unsupervised Multiplex Graph Anomaly Detection + + +
+ Graph anomaly detection (GAD) is a critical task in graph machine learning, +with the primary objective of identifying anomalous nodes that deviate +significantly from the majority. This task is widely applied in various +real-world scenarios, including fraud detection and social network analysis. +However, existing GAD methods still face two major challenges: (1) They are +often limited to detecting anomalies in single-type interaction graphs and +struggle with multiple interaction types in multiplex heterogeneous graphs. (2) +In unsupervised scenarios, selecting appropriate anomaly score thresholds +remains a significant challenge for accurate anomaly detection. To address the +above challenges, we propose a novel Unsupervised Multiplex Graph Anomaly +Detection method, named UMGAD. We first learn multi-relational correlations +among nodes in multiplex heterogeneous graphs and capture anomaly information +during node attribute and structure reconstruction through graph-masked +autoencoder (GMAE). Then, to further extract abnormal information, we generate +attribute-level and subgraph-level augmented-view graphs respectively, and +perform attribute and structure reconstruction through GMAE. Finally, we learn +to optimize node attributes and structural features through contrastive +learning between original-view and augmented-view graphs to improve the model's +ability to capture anomalies. Meanwhile, we also propose a new anomaly score +threshold selection strategy, which allows the model to be independent of +ground truth information in real unsupervised scenarios. Extensive experiments +on four datasets show that our UMGAD significantly outperforms state-of-the-art +methods, achieving average improvements of 13.48% in AUC and 11.68% in Macro-F1 +across all datasets. + +
+
+
+
+
+ + ♻ ☆ MOVE: Effective and Harmless Ownership Verification via Embedded + External Features AAAI 2022 + + +
+ Currently, deep neural networks (DNNs) are widely adopted in different +applications. Despite their commercial value, training a well-performing DNN is +resource-consuming. Accordingly, the well-trained model is valuable +intellectual property for its owner. However, recent studies revealed the +threats of model stealing, where the adversaries can obtain a function-similar +copy of the victim model, even when they can only query the model. In this +paper, we propose an effective and harmless model ownership verification (MOVE) +to defend against different types of model stealing simultaneously, without +introducing new security risks. In general, we conduct the ownership +verification by verifying whether a suspicious model contains the knowledge of +defender-specified external features. Specifically, we embed the external +features by modifying a few training samples with style transfer. We then train +a meta-classifier to determine whether a model is stolen from the victim. This +approach is inspired by the understanding that the stolen models should contain +the knowledge of features learned by the victim model. In particular, +we develop our MOVE method under both white-box and black-box +settings and analyze its theoretical foundation to provide comprehensive model +protection. Extensive experiments on benchmark datasets verify the +effectiveness of our method and its resistance to potential adaptive attacks. +The codes for reproducing the main experiments of our method are available at +https://github.com/THUYimingLi/MOVE. + +
+
+ comment: This paper has been accepted by IEEE TPAMI 2025. It is the journal + extension of our conference paper in AAAI 2022 + (https://ojs.aaai.org/index.php/AAAI/article/view/20036). 18 pages +
+
+
+
+
+ + ♻ ☆ Permutation-Invariant Graph Partitioning: How Graph Neural Networks + Capture Structural Interactions? + + +
+ Graph Neural Networks (GNNs) have paved the way for being a cornerstone in +graph-related learning tasks. Yet, the ability of GNNs to capture structural +interactions within graphs remains under-explored. In this work, we address +this gap by drawing on the insight that permutation invariant graph +partitioning enables a powerful way of exploring structural interactions. We +establish theoretical connections between permutation invariant graph +partitioning and graph isomorphism, and then propose Graph Partitioning Neural +Networks (GPNNs), a novel architecture that efficiently enhances the expressive +power of GNNs in learning structural interactions. We analyze how partitioning +schemes and structural interactions contribute to GNN expressivity and their +trade-offs with complexity. Empirically, we demonstrate that GPNNs outperform +existing GNN models in capturing structural interactions across diverse graph +benchmark tasks. + +
+
+
+
+
+ + ♻ ☆ Intrinsic Dimension Correlation: uncovering nonlinear connections in + multimodal representations ICLR 2025 + + +
+ To gain insight into the mechanisms behind machine learning methods, it is +crucial to establish connections among the features describing data points. +However, these correlations often exhibit a high-dimensional and strongly +nonlinear nature, which makes them challenging to detect using standard +methods. This paper exploits the entanglement between intrinsic dimensionality +and correlation to propose a metric that quantifies the (potentially nonlinear) +correlation between high-dimensional manifolds. We first validate our method on +synthetic data in controlled environments, showcasing its advantages and +drawbacks compared to existing techniques. Subsequently, we extend our analysis +to large-scale applications in neural network representations. Specifically, we +focus on latent representations of multimodal data, uncovering clear +correlations between paired visual and textual embeddings, whereas existing +methods struggle significantly in detecting similarity. Our results indicate +the presence of highly nonlinear correlation patterns between latent manifolds. + +
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Boosting Jailbreak Attack with Momentum ICASSP 2025 + + +
+ Large Language Models (LLMs) have achieved remarkable success across diverse +tasks, yet they remain vulnerable to adversarial attacks, notably the +well-known jailbreak attack. In particular, the Greedy Coordinate Gradient +(GCG) attack has demonstrated efficacy in exploiting this vulnerability by +optimizing adversarial prompts through a combination of gradient heuristics and +greedy search. However, the efficiency of this attack has become a bottleneck +in the attacking process. To mitigate this limitation, in this paper we rethink +the generation of the adversarial prompts through an optimization lens, aiming +to stabilize the optimization process and harness more heuristic insights from +previous optimization iterations. Specifically, we propose the +Momentum Accelerated GCG (MAC) attack, +which integrates a momentum term into the gradient heuristic to boost and +stabilize the random search for tokens in adversarial prompts. Experimental +results showcase the notable enhancement achieved by MAC over baselines in +terms of attack success rate and optimization efficiency. Moreover, we +demonstrate that MAC can still exhibit superior performance for transfer +attacks and models under defense mechanisms. Our code is available at +https://github.com/weizeming/momentum-attack-llm. + +
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ PIG: Physics-Informed Gaussians as Adaptive Parametric Mesh + Representations + + +
+ The numerical approximation of partial differential equations (PDEs) using +neural networks has seen significant advancements through Physics-Informed +Neural Networks (PINNs). Despite their straightforward optimization framework +and flexibility in implementing various PDEs, PINNs often suffer from limited +accuracy due to the spectral bias of Multi-Layer Perceptrons (MLPs), which +struggle to effectively learn high-frequency and nonlinear components. +Recently, parametric mesh representations in combination with neural networks +have been investigated as a promising approach to eliminate the inductive bias +of MLPs. However, they usually require high-resolution grids and a large number +of collocation points to achieve high accuracy while avoiding overfitting. In +addition, the fixed positions of the mesh parameters restrict their +flexibility, making accurate approximation of complex PDEs challenging. To +overcome these limitations, we propose Physics-Informed Gaussians (PIGs), which +combine feature embeddings using Gaussian functions with a lightweight neural +network. Our approach uses trainable parameters for the mean and variance of +each Gaussian, allowing for dynamic adjustment of their positions and shapes +during training. This adaptability enables our model to optimally approximate +PDE solutions, unlike models with fixed parameter positions. Furthermore, the +proposed approach maintains the same optimization framework used in PINNs, +allowing us to benefit from their excellent properties. Experimental results +show the competitive performance of our model across various PDEs, +demonstrating its potential as a robust tool for solving complex PDEs. Our +project page is available at +https://namgyukang.github.io/Physics-Informed-Gaussians/ + +
+
+ comment: Project page: + https://namgyukang.github.io/Physics-Informed-Gaussians/ +
+
+
+
+
+ + ♻ ☆ Geometric Inductive Biases of Deep Networks: The Role of Data and + Architecture + + +
+ In this paper, we propose the $\textit{geometric invariance hypothesis +(GIH)}$, which argues that the input space curvature of a neural network +remains invariant under transformation in certain architecture-dependent +directions during training. We investigate a simple, non-linear binary +classification problem residing on a plane in a high dimensional space and +observe that$\unicode{x2014}$unlike MLPs$\unicode{x2014}$ResNets fail to +generalize depending on the orientation of the plane. Motivated by this +example, we define a neural network's $\textbf{average geometry}$ and +$\textbf{average geometry evolution}$ as compact +$\textit{architecture-dependent}$ summaries of the model's input-output +geometry and its evolution during training. By investigating the average +geometry evolution at initialization, we discover that the geometry of a neural +network evolves according to the data covariance projected onto its average +geometry. This means that the geometry only changes in a subset of the input +space when the average geometry is low-rank, such as in ResNets. This causes an +architecture-dependent invariance property in the input space curvature, which +we dub GIH. Finally, we present extensive experimental results to observe the +consequences of GIH and how it relates to generalization in neural networks. + +
+
+
+
+
+ + ♻ ☆ On Theoretical Limits of Learning with Label Differential Privacy + + +
+ Label differential privacy (DP) is designed for learning problems involving +private labels and public features. While various methods have been proposed +for learning under label DP, the theoretical limits remain largely unexplored. +In this paper, we investigate the fundamental limits of learning with label DP +in both local and central models for both classification and regression tasks, +characterized by minimax convergence rates. We establish lower bounds by +converting each task into a multiple hypothesis testing problem and bounding +the test error. Additionally, we develop algorithms that yield matching upper +bounds. Our results demonstrate that under label local DP (LDP), the risk has a +significantly faster convergence rate than that under full LDP, i.e. protecting +both features and labels, indicating the advantages of relaxing the DP +definition to focus solely on labels. In contrast, under the label central DP +(CDP), the risk is only reduced by a constant factor compared to full DP, +indicating that the relaxation of CDP only has limited benefits on the +performance. + +
+
+
+
+
+ + ♻ ☆ FOSP: Fine-tuning Offline Safe Policy through World Models ICLR2025 + + +
+ Offline Safe Reinforcement Learning (RL) seeks to address safety constraints +by learning from static datasets and restricting exploration. However, these +approaches heavily rely on the dataset and struggle to generalize to unseen +scenarios safely. In this paper, we aim to improve safety during the deployment +of vision-based robotic tasks through online fine-tuning an offline pretrained +policy. To facilitate effective fine-tuning, we introduce model-based RL, which +is known for its data efficiency. Specifically, our method employs in-sample +optimization to improve offline training efficiency while incorporating +reachability guidance to ensure safety. After obtaining an offline safe policy, +a safe policy expansion approach is leveraged for online fine-tuning. The +performance of our method is validated on simulation benchmarks with five +vision-only tasks and through real-world robot deployment using limited data. +It demonstrates that our approach significantly improves the generalization of +offline policies to unseen safety-constrained scenarios. To the best of our +knowledge, this is the first work to explore offline-to-online RL for safe +generalization tasks. + +
+
+ comment: 32 pages, ICLR2025 +
+
+
+
+
+ + ♻ ☆ Breaking the Reclustering Barrier in Centroid-based Deep Clustering ICLR 2025 + + +
+ This work investigates an important phenomenon in centroid-based deep +clustering (DC) algorithms: Performance quickly saturates after a period of +rapid early gains. Practitioners commonly address early saturation with +periodic reclustering, which we demonstrate to be insufficient to address +performance plateaus. We call this phenomenon the "reclustering barrier" and +empirically show when the reclustering barrier occurs, what its underlying +mechanisms are, and how it is possible to Break the Reclustering Barrier with +our algorithm BRB. BRB avoids early over-commitment to initial clusterings and +enables continuous adaptation to reinitialized clustering targets while +remaining conceptually simple. Applying our algorithm to widely-used +centroid-based DC algorithms, we show that (1) BRB consistently improves +performance across a wide range of clustering benchmarks, (2) BRB enables +training from scratch, and (3) BRB performs competitively against +state-of-the-art DC algorithms when combined with a contrastive loss. We +release our code and pre-trained models at +https://github.com/Probabilistic-and-Interactive-ML/breaking-the-reclustering-barrier . + +
+
+ comment: Accepted at ICLR 2025 (Camera-ready version) +
+
+
+
+
+ + ♻ ☆ Greener GRASS: Enhancing GNNs with Encoding, Rewiring, and Attention ICLR 2025 + + +
+ Graph Neural Networks (GNNs) have become important tools for machine learning +on graph-structured data. In this paper, we explore the synergistic combination +of graph encoding, graph rewiring, and graph attention, by introducing Graph +Attention with Stochastic Structures (GRASS), a novel GNN architecture. GRASS +utilizes relative random walk probabilities (RRWP) encoding and a novel +decomposed variant (D-RRWP) to efficiently capture structural information. It +rewires the input graph by superimposing a random regular graph to enhance +long-range information propagation. It also employs a novel additive attention +mechanism tailored for graph-structured data. Our empirical evaluations +demonstrate that GRASS achieves state-of-the-art performance on multiple +benchmark datasets, including a 20.3% reduction in mean absolute error on the +ZINC dataset. + +
+
+ comment: Published as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Robust Weight Initialization for Tanh Neural Networks with Fixed Point + Analysis ICLR 2025 + + +
+ As a neural network's depth increases, it can improve generalization +performance. However, training deep networks is challenging due to gradient and +signal propagation issues. To address these challenges, extensive theoretical +research and various methods have been introduced. Despite these advances, +effective weight initialization methods for tanh neural networks remain +insufficiently investigated. This paper presents a novel weight initialization +method for neural networks with tanh activation function. Based on an analysis +of the fixed points of the function $\tanh(ax)$, the proposed method aims to +determine values of $a$ that mitigate activation saturation. A series of +experiments on various classification datasets and physics-informed neural +networks demonstrates that the proposed method outperforms Xavier +initialization methods~(with or without normalization) in terms of robustness +across different network sizes, data efficiency, and convergence speed. Code is +available at https://github.com/1HyunwooLee/Tanh-Init + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Taxonomy, Opportunities, and Challenges of Representation Engineering + for Large Language Models + + +
+ Representation Engineering (RepE) is a novel paradigm for controlling the +behavior of LLMs. Unlike traditional approaches that modify inputs or fine-tune +the model, RepE directly manipulates the model's internal representations. As a +result, it may offer more effective, interpretable, data-efficient, and +flexible control over models' behavior. We present the first comprehensive +survey of RepE for LLMs, reviewing the rapidly growing literature to address +key questions: What RepE methods exist and how do they differ? For what +concepts and problems has RepE been applied? What are the strengths and +weaknesses of RepE compared to other methods? To answer these, we propose a +unified framework describing RepE as a pipeline comprising representation +identification, operationalization, and control. We posit that while RepE +methods offer significant potential, challenges remain, including managing +multiple concepts, ensuring reliability, and preserving models' performance. +Towards improving RepE, we identify opportunities for experimental and +methodological improvements and construct a guide for best practices. + +
+
+
+
+
+ + ♻ ☆ Timer-XL: Long-Context Transformers for Unified Time Series Forecasting + + +
+ We present Timer-XL, a causal Transformer for unified time series +forecasting. To uniformly predict multidimensional time series, we generalize +next token prediction, predominantly adopted for 1D token sequences, to +multivariate next token prediction. The paradigm formulates various forecasting +tasks as a long-context prediction problem. We opt for decoder-only +Transformers that capture causal dependencies from varying-length contexts for +unified forecasting, making predictions on non-stationary univariate time +series, multivariate series with complicated dynamics and correlations, as well +as covariate-informed contexts that include exogenous variables. Technically, +we propose a universal TimeAttention to capture fine-grained intra- and +inter-series dependencies of flattened time series tokens (patches), which is +further enhanced by deft position embedding for temporal causality and variable +equivalence. Timer-XL achieves state-of-the-art performance across +task-specific forecasting benchmarks through a unified approach. Based on +large-scale pre-training, Timer-XL achieves state-of-the-art zero-shot +performance, making it a promising architecture for pre-trained time series +models. Code is available at this repository: +https://github.com/thuml/Timer-XL. + +
+
+
+
+
+ + ♻ ☆ On the Asymptotic Mean Square Error Optimality of Diffusion Models + + +
+ Diffusion models (DMs) as generative priors have recently shown great +potential for denoising tasks but lack theoretical understanding with respect +to their mean square error (MSE) optimality. This paper proposes a novel +denoising strategy inspired by the structure of the MSE-optimal conditional +mean estimator (CME). The resulting DM-based denoiser can be conveniently +employed using a pre-trained DM, being particularly fast by truncating reverse +diffusion steps and not requiring stochastic re-sampling. We present a +comprehensive (non-)asymptotic optimality analysis of the proposed +diffusion-based denoiser, demonstrating polynomial-time convergence to the CME +under mild conditions. Our analysis also derives a novel Lipschitz constant +that depends solely on the DM's hyperparameters. Further, we offer a new +perspective on DMs, showing that they inherently combine an asymptotically +optimal denoiser with a powerful generator, modifiable by switching re-sampling +in the reverse process on or off. The theoretical findings are thoroughly +validated with experiments based on various benchmark datasets. + +
+
+
+
+
+ + ♻ ☆ End-to-End Modeling Hierarchical Time Series Using Autoregressive + Transformer and Conditional Normalizing Flow based Reconciliation ICDM2022 + + +
+ Multivariate time series forecasting with hierarchical structure is pervasive +in real-world applications, demanding not only predicting each level of the +hierarchy, but also reconciling all forecasts to ensure coherency, i.e., the +forecasts should satisfy the hierarchical aggregation constraints. Moreover, +the disparities of statistical characteristics between levels can be huge, +worsened by non-Gaussian distributions and non-linear correlations. To this +extent, we propose a novel end-to-end hierarchical time series forecasting +model, based on conditioned normalizing flow-based autoregressive transformer +reconciliation, to represent complex data distribution while simultaneously +reconciling the forecasts to ensure coherency. Unlike other state-of-the-art +methods, we achieve the forecasting and reconciliation simultaneously without +requiring any explicit post-processing step. In addition, by harnessing the +power of deep model, we do not rely on any assumption such as unbiased +estimates or Gaussian distribution. Our evaluation experiments are conducted on +four real-world hierarchical datasets from different industrial domains (three +public ones and a dataset from the application servers of Alipay's data center) +and the preliminary results demonstrate efficacy of our proposed method. + +
+
+ comment: Accepted by the 22nd IEEE International Conference on Data Mining + (ICDM2022) +
+
+
+
+
+ + ♻ ☆ Training-Free Message Passing for Learning on Hypergraphs + + +
+ Hypergraphs are crucial for modelling higher-order interactions in real-world +data. Hypergraph neural networks (HNNs) effectively utilise these structures by +message passing to generate informative node features for various downstream +tasks like node classification. However, the message passing module in existing +HNNs typically requires a computationally intensive training process, which +limits their practical use. To tackle this challenge, we propose an alternative +approach by decoupling the usage of hypergraph structural information from the +model learning stage. This leads to a novel training-free message passing +module, named TF-MP-Module, which can be precomputed in the data preprocessing +stage, thereby reducing the computational burden. We refer to the hypergraph +neural network equipped with our TF-MP-Module as TF-HNN. We theoretically +support the efficiency and effectiveness of TF-HNN by showing that: 1) It is +more training-efficient compared to existing HNNs; 2) It utilises as much +information as existing HNNs for node feature generation; and 3) It is robust +against the oversmoothing issue while using long-range interactions. +Experiments based on seven real-world hypergraph benchmarks in node +classification and hyperlink prediction show that, compared to state-of-the-art +HNNs, TF-HNN exhibits both competitive performance and superior training +efficiency. Specifically, on the large-scale benchmark, Trivago, TF-HNN +outperforms the node classification accuracy of the best baseline by 10% with +just 1% of the training time of that baseline. + +
+
+
+
+
+ + ♻ ☆ DiscoGraMS: Enhancing Movie Screen-Play Summarization using Movie + Character-Aware Discourse Graph NAACL 2025 + + +
+ Summarizing movie screenplays presents a unique set of challenges compared to +standard document summarization. Screenplays are not only lengthy, but also +feature a complex interplay of characters, dialogues, and scenes, with numerous +direct and subtle relationships and contextual nuances that are difficult for +machine learning models to accurately capture and comprehend. Recent attempts +at screenplay summarization focus on fine-tuning transformer-based pre-trained +models, but these models often fall short in capturing long-term dependencies +and latent relationships, and frequently encounter the "lost in the middle" +issue. To address these challenges, we introduce DiscoGraMS, a novel resource +that represents movie scripts as a movie character-aware discourse graph (CaD +Graph). This approach is well-suited for various downstream tasks, such as +summarization, question-answering, and salience detection. The model aims to +preserve all salient information, offering a more comprehensive and faithful +representation of the screenplay's content. We further explore a baseline +method that combines the CaD Graph with the corresponding movie script through +a late fusion of graph and text modalities, and we present very initial +promising results. + +
+
+ comment: Accepted at NAACL 2025 (Main) +
+
+
+
+
+ + ♻ ☆ Dist Loss: Enhancing Regression in Few-Shot Region through Distribution + Distance Constraint + + +
+ Imbalanced data distributions are prevalent in real-world scenarios, posing +significant challenges in both imbalanced classification and imbalanced +regression tasks. They often cause deep learning models to overfit in areas of +high sample density (many-shot regions) while underperforming in areas of low +sample density (few-shot regions). This characteristic restricts the utility of +deep learning models in various sectors, notably healthcare, where areas with +few-shot data hold greater clinical relevance. While recent studies have shown +the benefits of incorporating distribution information in imbalanced +classification tasks, such strategies are rarely explored in imbalanced +regression. In this paper, we address this issue by introducing a novel loss +function, termed Dist Loss, designed to minimize the distribution distance +between the model's predictions and the target labels in a differentiable +manner, effectively integrating distribution information into model training. +Dist Loss enables deep learning models to regularize their output distribution +during training, effectively enhancing their focus on few-shot regions. We have +conducted extensive experiments across three datasets spanning computer vision +and healthcare: IMDB-WIKI-DIR, AgeDB-DIR, and ECG-Ka-DIR. The results +demonstrate that Dist Loss effectively mitigates the negative impact of +imbalanced data distribution on model performance, achieving state-of-the-art +results in sparse data regions. Furthermore, Dist Loss is easy to integrate, +complementing existing methods. + +
+
+
+
+
+ + ♻ ☆ Market-Derived Financial Sentiment Analysis: Context-Aware Language + Models for Crypto Forecasting + + +
+ Financial Sentiment Analysis (FSA) traditionally relies on human-annotated +sentiment labels to infer investor sentiment and forecast market movements. +However, inferring the potential market impact of words based on their +human-perceived intentions is inherently challenging. We hypothesize that the +historical market reactions to words offer a more reliable indicator of their +potential impact on markets than subjective sentiment interpretations by human +annotators. To test this hypothesis, a market-derived labeling approach is +proposed to assign tweet labels based on ensuing short-term price trends, +enabling the language model to capture the relationship between textual signals +and market dynamics directly. A domain-specific language model was fine-tuned +on these labels, achieving up to an 11% improvement in short-term trend +prediction accuracy over traditional sentiment-based benchmarks. Moreover, by +incorporating market and temporal context through prompt-tuning, the proposed +context-aware language model demonstrated an accuracy of 89.6% on a curated +dataset of 227 impactful Bitcoin-related news events with significant market +impacts. Aggregating daily tweet predictions into trading signals, our method +outperformed traditional fusion models (which combine sentiment-based and +price-based predictions). It challenged the assumption that sentiment-based +signals are inferior to price-based predictions in forecasting market +movements. Backtesting these signals across three distinct market regimes +yielded robust Sharpe ratios of up to 5.07 in trending markets and 3.73 in +neutral markets. Our findings demonstrate that language models can serve as +effective short-term market predictors. This paradigm shift underscores the +untapped capabilities of language models in financial decision-making and opens +new avenues for market prediction applications. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ♻ ☆ FedBiP: Heterogeneous One-Shot Federated Learning with Personalized + Latent Diffusion Models CVPR 2025 + + +
+ One-Shot Federated Learning (OSFL), a special decentralized machine learning +paradigm, has recently gained significant attention. OSFL requires only a +single round of client data or model upload, which reduces communication costs +and mitigates privacy threats compared to traditional FL. Despite these +promising prospects, existing methods face challenges due to client data +heterogeneity and limited data quantity when applied to real-world OSFL +systems. Recently, Latent Diffusion Models (LDM) have shown remarkable +advancements in synthesizing high-quality images through pretraining on +large-scale datasets, thereby presenting a potential solution to overcome these +issues. However, directly applying pretrained LDM to heterogeneous OSFL results +in significant distribution shifts in synthetic data, leading to performance +degradation in classification models trained on such data. This issue is +particularly pronounced in rare domains, such as medical imaging, which are +underrepresented in LDM's pretraining data. To address this challenge, we +propose Federated Bi-Level Personalization (FedBiP), which personalizes the +pretrained LDM at both instance-level and concept-level. Hereby, FedBiP +synthesizes images following the client's local data distribution without +compromising the privacy regulations. FedBiP is also the first approach to +simultaneously address feature space heterogeneity and client data scarcity in +OSFL. Our method is validated through extensive experiments on three OSFL +benchmarks with feature space heterogeneity, as well as on challenging medical +and satellite image datasets with label heterogeneity. The results demonstrate +the effectiveness of FedBiP, which substantially outperforms other OSFL +methods. + +
+
+ comment: CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event + Condition For Foley Sound + + +
+ Foley sound synthesis is crucial for multimedia production, enhancing user +experience by synchronizing audio and video both temporally and semantically. +Recent studies on automating this labor-intensive process through +video-to-sound generation face significant challenges. Systems lacking explicit +temporal features suffer from poor alignment and controllability, while +timestamp-based models require costly and subjective human annotation. We +propose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as an +intuitive condition with semantic timbre prompts (audio or text). RMS, a +frame-level intensity envelope closely related to audio semantics, acts as a +temporal event feature to guide audio generation from video. The +annotation-free self-supervised learning framework consists of two stages, +Video2RMS and RMS2Sound, incorporating novel ideas including RMS discretization +and RMS-ControlNet with a pretrained text-to-audio model. Our extensive +evaluation shows that Video-Foley achieves state-of-the-art performance in +audio-visual alignment and controllability for sound timing, intensity, timbre, +and nuance. Source code, model weights and demos are available on our companion +website. (https://jnwnlee.github.io/video-foley-demo) + +
+
+
+
+
+ + ♻ ☆ Audio-Visual Instance Segmentation CVPR 2025 + + +
+ In this paper, we propose a new multi-modal task, termed audio-visual +instance segmentation (AVIS), which aims to simultaneously identify, segment +and track individual sounding object instances in audible videos. To facilitate +this research, we introduce a high-quality benchmark named AVISeg, containing +over 90K instance masks from 26 semantic categories in 926 long videos. +Additionally, we propose a strong baseline model for this task. Our model first +localizes sound source within each frame, and condenses object-specific +contexts into concise tokens. Then it builds long-range audio-visual +dependencies between these tokens using window-based attention, and tracks +sounding objects among the entire video sequences. Extensive experiments reveal +that our method performs best on AVISeg, surpassing the existing methods from +related tasks. We further conduct the evaluation on several multi-modal large +models. Unfortunately, they exhibit subpar performance on instance-level sound +source localization and temporal perception. We expect that AVIS will inspire +the community towards a more comprehensive multi-modal understanding. Dataset +and code are available at https://github.com/ruohaoguo/avis. + +
+
+ comment: Accepted by CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Improving Long-Text Alignment for Text-to-Image Diffusion Models + + +
+ The rapid advancement of text-to-image (T2I) diffusion models has enabled +them to generate unprecedented results from given texts. However, as text +inputs become longer, existing encoding methods like CLIP face limitations, and +aligning the generated images with long texts becomes challenging. To tackle +these issues, we propose LongAlign, which includes a segment-level encoding +method for processing long texts and a decomposed preference optimization +method for effective alignment training. For segment-level encoding, long texts +are divided into multiple segments and processed separately. This method +overcomes the maximum input length limits of pretrained encoding models. For +preference optimization, we provide decomposed CLIP-based preference models to +fine-tune diffusion models. Specifically, to utilize CLIP-based preference +models for T2I alignment, we delve into their scoring mechanisms and find that +the preference scores can be decomposed into two components: a text-relevant +part that measures T2I alignment and a text-irrelevant part that assesses other +visual aspects of human preference. Additionally, we find that the +text-irrelevant part contributes to a common overfitting problem during +fine-tuning. To address this, we propose a reweighting strategy that assigns +different weights to these two components, thereby reducing overfitting and +enhancing alignment. After fine-tuning $512 \times 512$ Stable Diffusion (SD) +v1.5 for about 20 hours using our method, the fine-tuned SD outperforms +stronger foundation models in T2I alignment, such as PixArt-$\alpha$ and +Kandinsky v2.2. The code is available at +https://github.com/luping-liu/LongAlign. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 18 + +
+
+
+ + ♻ ☆ Joint Person Identity, Gender and Age Estimation from Hand Images using + Deep Multi-Task Representation Learning + + +
+ In this paper, we propose a multi-task representation learning framework to +jointly estimate the identity, gender and age of individuals from their hand +images for the purpose of criminal investigations since the hand images are +often the only available information in cases of serious crime such as sexual +abuse. We investigate different up-to-date deep learning architectures and +compare their performance for joint estimation of identity, gender and age from +hand images of perpetrators of serious crime. To simplify the age prediction, +we create age groups for the age estimation. We make extensive evaluations and +comparisons of both convolution-based and transformer-based deep learning +architectures on a publicly available 11k hands dataset. Our experimental +analysis shows that it is possible to efficiently estimate not only identity +but also other attributes such as gender and age of suspects jointly from hand +images for criminal investigations, which is crucial in assisting international +police forces in the court to identify and convict abusers. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2209.04821 +
+
+
+
+
+ + ♻ ☆ ViViDex: Learning Vision-based Dexterous Manipulation from Human Videos ICRA 2025 + + +
+ In this work, we aim to learn a unified vision-based policy for +multi-fingered robot hands to manipulate a variety of objects in diverse poses. +Though prior work has shown benefits of using human videos for policy learning, +performance gains have been limited by the noise in estimated trajectories. +Moreover, reliance on privileged object information such as ground-truth object +states further limits the applicability in realistic scenarios. To address +these limitations, we propose a new framework ViViDex to improve vision-based +policy learning from human videos. It first uses reinforcement learning with +trajectory guided rewards to train state-based policies for each video, +obtaining both visually natural and physically plausible trajectories from the +video. We then rollout successful episodes from state-based policies and train +a unified visual policy without using any privileged information. We propose +coordinate transformation to further enhance the visual point cloud +representation, and compare behavior cloning and diffusion policy for the +visual policy training. Experiments both in simulation and on the real robot +demonstrate that ViViDex outperforms state-of-the-art approaches on three +dexterous manipulation tasks. + +
+
+ comment: Accepted by ICRA 2025. Project Page: + https://zerchen.github.io/projects/vividex.html +
+
+
+
+
+ + ♻ ☆ VDT-Auto: End-to-end Autonomous Driving with VLM-Guided Diffusion + Transformers + + +
+ In autonomous driving, dynamic environment and corner cases pose significant +challenges to the robustness of ego vehicle's decision-making. To address these +challenges, commencing with the representation of state-action mapping in the +end-to-end autonomous driving paradigm, we introduce a novel pipeline, +VDT-Auto. Leveraging the advancement of the state understanding of Visual +Language Model (VLM), incorporating with diffusion Transformer-based action +generation, our VDT-Auto parses the environment geometrically and contextually +for the conditioning of the diffusion process. Geometrically, we use a +bird's-eye view (BEV) encoder to extract feature grids from the surrounding +images. Contextually, the structured output of our fine-tuned VLM is processed +into textual embeddings and noisy paths. During our diffusion process, the +added noise for the forward process is sampled from the noisy path output of +the fine-tuned VLM, while the extracted BEV feature grids and embedded texts +condition the reverse process of our diffusion Transformers. Our VDT-Auto +achieved 0.52m on average L2 errors and 21% on average collision rate in the +nuScenes open-loop planning evaluation. Moreover, the real-world demonstration +exhibited prominent generalizability of our VDT-Auto. The code and dataset will +be released after acceptance. + +
+
+ comment: Submitted paper +
+
+
+
+
+ + ♻ ☆ HDKD: Hybrid Data-Efficient Knowledge Distillation Network for Medical + Image Classification + + +
+ Vision Transformers (ViTs) have achieved significant advancement in computer +vision tasks due to their powerful modeling capacity. However, their +performance notably degrades when trained with insufficient data due to lack of +inherent inductive biases. Distilling knowledge and inductive biases from a +Convolutional Neural Network (CNN) teacher has emerged as an effective strategy +for enhancing the generalization of ViTs on limited datasets. Previous +approaches to Knowledge Distillation (KD) have pursued two primary paths: some +focused solely on distilling the logit distribution from CNN teacher to ViT +student, neglecting the rich semantic information present in intermediate +features due to the structural differences between them. Others integrated +feature distillation along with logit distillation, yet this introduced +alignment operations that limit the amount of knowledge transferred due to +mismatched architectures and increased the computational overhead. To this end, +this paper presents Hybrid Data-efficient Knowledge Distillation (HDKD) +paradigm which employs a CNN teacher and a hybrid student. The choice of hybrid +student serves two main aspects. First, it leverages the strengths of both +convolutions and transformers while sharing the convolutional structure with +the teacher model. Second, this shared structure enables the direct application +of feature distillation without any information loss or additional +computational overhead. Additionally, we propose an efficient light-weight +convolutional block named Mobile Channel-Spatial Attention (MBCSA), which +serves as the primary convolutional block in both teacher and student models. +Extensive experiments on two medical public datasets showcase the superiority +of HDKD over other state-of-the-art models and its computational efficiency. +Source code at: https://github.com/omarsherif200/HDKD + +
+
+
+
+
+ + ♻ ☆ Revisiting Text-to-Image Evaluation with Gecko: On Metrics, Prompts, and + Human Ratings ICLR 2025 + + +
+ While text-to-image (T2I) generative models have become ubiquitous, they do +not necessarily generate images that align with a given prompt. While previous +work has evaluated T2I alignment by proposing metrics, benchmarks, and +templates for collecting human judgements, the quality of these components is +not systematically measured. Human-rated prompt sets are generally small and +the reliability of the ratings -- and thereby the prompt set used to compare +models -- is not evaluated. We address this gap by performing an extensive +study evaluating auto-eval metrics and human templates. We provide three main +contributions: (1) We introduce a comprehensive skills-based benchmark that can +discriminate models across different human templates. This skills-based +benchmark categorises prompts into sub-skills, allowing a practitioner to +pinpoint not only which skills are challenging, but at what level of complexity +a skill becomes challenging. (2) We gather human ratings across four templates +and four T2I models for a total of >100K annotations. This allows us to +understand where differences arise due to inherent ambiguity in the prompt and +where they arise due to differences in metric and model quality. (3) Finally, +we introduce a new QA-based auto-eval metric that is better correlated with +human ratings than existing metrics for our new dataset, across different human +templates, and on TIFA160. + +
+
+ comment: Accepted to ICLR 2025 (Spotlight) +
+
+
+
+
+ + ♻ ☆ FitDiff: Robust monocular 3D facial shape and reflectance estimation + using Diffusion Models + + +
+ The remarkable progress in 3D face reconstruction has resulted in high-detail +and photorealistic facial representations. Recently, Diffusion Models have +revolutionized the capabilities of generative methods by surpassing the +performance of GANs. In this work, we present FitDiff, a diffusion-based 3D +facial avatar generative model. Leveraging diffusion principles, our model +accurately generates relightable facial avatars, utilizing an identity +embedding extracted from an "in-the-wild" 2D facial image. The introduced +multi-modal diffusion model is the first to concurrently output facial +reflectance maps (diffuse and specular albedo and normals) and shapes, +showcasing great generalization capabilities. It is solely trained on an +annotated subset of a public facial dataset, paired with 3D reconstructions. We +revisit the typical 3D facial fitting approach by guiding a reverse diffusion +process using perceptual and face recognition losses. Being the first 3D LDM +conditioned on face recognition embeddings, FitDiff reconstructs relightable +human avatars, that can be used as-is in common rendering engines, starting +only from an unconstrained facial image, and achieving state-of-the-art +performance. + +
+
+
+
+
+ + ♻ ☆ Towards Generalizable Vision-Language Robotic Manipulation: A Benchmark + and LLM-guided 3D Policy ICRA 2025 + + +
+ Generalizing language-conditioned robotic policies to new tasks remains a +significant challenge, hampered by the lack of suitable simulation benchmarks. +In this paper, we address this gap by introducing GemBench, a novel benchmark +to assess generalization capabilities of vision-language robotic manipulation +policies. GemBench incorporates seven general action primitives and four levels +of generalization, spanning novel placements, rigid and articulated objects, +and complex long-horizon tasks. We evaluate state-of-the-art approaches on +GemBench and also introduce a new method. Our approach 3D-LOTUS leverages rich +3D information for action prediction conditioned on language. While 3D-LOTUS +excels in both efficiency and performance on seen tasks, it struggles with +novel tasks. To address this, we present 3D-LOTUS++, a framework that +integrates 3D-LOTUS's motion planning capabilities with the task planning +capabilities of LLMs and the object grounding accuracy of VLMs. 3D-LOTUS++ +achieves state-of-the-art performance on novel tasks of GemBench, setting a new +standard for generalization in robotic manipulation. The benchmark, codes and +trained models are available at +https://www.di.ens.fr/willow/research/gembench/. + +
+
+ comment: ICRA 2025 +
+
+
+
+
+ + ♻ ☆ Leveraging Vision Language Models for Specialized Agricultural Tasks WACV 2025 + + +
+ As Vision Language Models (VLMs) become increasingly accessible to farmers +and agricultural experts, there is a growing need to evaluate their potential +in specialized tasks. We present AgEval, a comprehensive benchmark for +assessing VLMs' capabilities in plant stress phenotyping, offering a solution +to the challenge of limited annotated data in agriculture. Our study explores +how general-purpose VLMs can be leveraged for domain-specific tasks with only a +few annotated examples, providing insights into their behavior and +adaptability. AgEval encompasses 12 diverse plant stress phenotyping tasks, +evaluating zero-shot and few-shot in-context learning performance of +state-of-the-art models including Claude, GPT, Gemini, and LLaVA. Our results +demonstrate VLMs' rapid adaptability to specialized tasks, with the +best-performing model showing an increase in F1 scores from 46.24% to 73.37% in +8-shot identification. To quantify performance disparities across classes, we +introduce metrics such as the coefficient of variation (CV), revealing that +VLMs' training impacts classes differently, with CV ranging from 26.02% to +58.03%. We also find that strategic example selection enhances model +reliability, with exact category examples improving F1 scores by 15.38% on +average. AgEval establishes a framework for assessing VLMs in agricultural +applications, offering valuable benchmarks for future evaluations. Our findings +suggest that VLMs, with minimal few-shot examples, show promise as a viable +alternative to traditional specialized models in plant stress phenotyping, +while also highlighting areas for further refinement. Results and benchmark +details are available at: https://github.com/arbab-ml/AgEval + +
+
+ comment: Published at WACV 2025 +
+
+
+
+
+ + ♻ ☆ ET-Former: Efficient Triplane Deformable Attention for 3D Semantic Scene + Completion From Monocular Camera + + +
+ We introduce ET-Former, a novel end-to-end algorithm for semantic scene +completion using a single monocular camera. Our approach generates a semantic +occupancy map from single RGB observation while simultaneously providing +uncertainty estimates for semantic predictions. By designing a triplane-based +deformable attention mechanism, our approach improves geometric understanding +of the scene compared to other SOTA approaches and reduces noise in semantic +predictions. Additionally, through the use of a Conditional Variational +AutoEncoder (CVAE), we estimate the uncertainties of these predictions. The +generated semantic and uncertainty maps will help formulate navigation +strategies that facilitate safe and permissible decision making in the future. +Evaluated on the Semantic-KITTI dataset, ET-Former achieves the highest +Intersection over Union (IoU) and mean IoU (mIoU) scores while maintaining the +lowest GPU memory usage, surpassing state-of-the-art (SOTA) methods. It +improves the SOTA scores of IoU from 44.71 to 51.49 and mIoU from 15.04 to +16.30 on SemanticKITTI test, with a notably low training memory consumption of +10.9 GB. Project page: https://github.com/jingGM/ET-Former.git. + +
+
+
+
+
+ + ♻ ☆ Towards Hierarchical Rectified Flow ICLR 2025 + + +
+ We formulate a hierarchical rectified flow to model data distributions. It +hierarchically couples multiple ordinary differential equations (ODEs) and +defines a time-differentiable stochastic process that generates a data +distribution from a known source distribution. Each ODE resembles the ODE that +is solved in a classic rectified flow, but differs in its domain, i.e., +location, velocity, acceleration, etc. Unlike the classic rectified flow +formulation, which formulates a single ODE in the location domain and only +captures the expected velocity field (sufficient to capture a multi-modal data +distribution), the hierarchical rectified flow formulation models the +multi-modal random velocity field, acceleration field, etc., in their entirety. +This more faithful modeling of the random velocity field enables integration +paths to intersect when the underlying ODE is solved during data generation. +Intersecting paths in turn lead to integration trajectories that are more +straight than those obtained in the classic rectified flow formulation, where +integration paths cannot intersect. This leads to modeling of data +distributions with fewer neural function evaluations. We empirically verify +this on synthetic 1D and 2D data as well as MNIST, CIFAR-10, and ImageNet-32 +data. Our code is available at: https://riccizz.github.io/HRF/. + +
+
+ comment: ICLR 2025; Project Page: https://riccizz.github.io/HRF/ +
+
+
+
+
+ + ♻ ☆ DATransNet: Dynamic Attention Transformer Network for Infrared Small + Target Detection + + +
+ Infrared small target detection (ISTD) is widely used in civilian and +military applications. However, ISTD encounters several challenges, including +the tendency for small and dim targets to be obscured by complex backgrounds. +To address this issue, we propose the Dynamic Attention Transformer Network +(DATransNet), which aims to extract and preserve detailed information vital for +small targets. DATransNet employs the Dynamic Attention Transformer (DATrans), +simulating central difference convolutions (CDC) to extract gradient features. +Furthermore, we propose a global feature extraction module (GFEM) that offers a +comprehensive perspective to prevent the network from focusing solely on +details while neglecting the global information. We compare the network with +state-of-the-art (SOTA) approaches and demonstrate that our method performs +effectively. Our source code is available at +https://github.com/greekinRoma/DATransNet. + +
+
+
+
+
+ + ♻ ☆ Image Matching Filtering and Refinement by Planes and Beyond + + +
+ This paper introduces a modular, non-deep learning method for filtering and +refining sparse correspondences in image matching. Assuming that motion flow +within the scene can be approximated by local homography transformations, +matches are aggregated into overlapping clusters corresponding to virtual +planes using an iterative RANSAC-based approach, with non-conforming +correspondences discarded. Moreover, the underlying planar structural design +provides an explicit map between local patches associated with the matches, +enabling optional refinement of keypoint positions through cross-correlation +template matching after patch reprojection. Finally, to enhance robustness and +fault-tolerance against violations of the piece-wise planar approximation +assumption, a further strategy is designed for minimizing relative patch +distortion in the plane reprojection by introducing an intermediate homography +that projects both patches into a common plane. The proposed method is +extensively evaluated on standard datasets and image matching pipelines, and +compared with state-of-the-art approaches. Unlike other current comparisons, +the proposed benchmark also takes into account the more general, real, and +practical cases where camera intrinsics are unavailable. Experimental results +demonstrate that our proposed non-deep learning, geometry-based approach +achieves performances that are either superior to or on par with recent +state-of-the-art deep learning methods. Finally, this study suggests that there +is still development potential in actual image matching solutions in the +considered research direction, which could be in the future incorporated in +novel deep image matching architectures. + +
+
+ comment: project page: https://github.com/fb82/MiHo +
+
+
+
+
+ + ♻ ☆ SPA: 3D Spatial-Awareness Enables Effective Embodied Representation + + +
+ In this paper, we introduce SPA, a novel representation learning framework +that emphasizes the importance of 3D spatial awareness in embodied AI. Our +approach leverages differentiable neural rendering on multi-view images to +endow a vanilla Vision Transformer (ViT) with intrinsic spatial understanding. +We present the most comprehensive evaluation of embodied representation +learning to date, covering 268 tasks across 8 simulators with diverse policies +in both single-task and language-conditioned multi-task scenarios. The results +are compelling: SPA consistently outperforms more than 10 state-of-the-art +representation methods, including those specifically designed for embodied AI, +vision-centric tasks, and multi-modal applications, while using less training +data. Furthermore, we conduct a series of real-world experiments to confirm its +effectiveness in practical scenarios. These results highlight the critical role +of 3D spatial awareness for embodied representation learning. Our strongest +model takes more than 6000 GPU hours to train and we are committed to +open-sourcing all code and model weights to foster future research in embodied +representation learning. Project Page: https://haoyizhu.github.io/spa/. + +
+
+ comment: Project Page: https://haoyizhu.github.io/spa/ +
+
+
+
+
+ + ♻ ☆ Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection + + +
+ While witnessed with rapid development, remote sensing object detection +remains challenging for detecting high aspect ratio objects. This paper shows +that large strip convolutions are good feature representation learners for +remote sensing object detection and can detect objects of various aspect ratios +well. Based on large strip convolutions, we build a new network architecture +called Strip R-CNN, which is simple, efficient, and powerful. Unlike recent +remote sensing object detectors that leverage large-kernel convolutions with +square shapes, our Strip R-CNN takes advantage of sequential orthogonal large +strip convolutions in our backbone network StripNet to capture spatial +information. In addition, we improve the localization capability of +remote-sensing object detectors by decoupling the detection heads and equipping +the localization branch with strip convolutions in our strip head. Extensive +experiments on several benchmarks, for example DOTA, FAIR1M, HRSC2016, and +DIOR, show that our Strip R-CNN can greatly improve previous work. In +particular, our 30M model achieves 82.75% mAP on DOTA-v1.0, setting a new +state-of-the-art record. Our code will be made publicly available. Code is +available at https://github.com/YXB-NKU/Strip-R-CNN. + +
+
+
+
+
+ + ♻ ☆ MobileViM: A Light-weight and Dimension-independent Vision Mamba for 3D + Medical Image Analysis + + +
+ Efficient evaluation of three-dimensional (3D) medical images is crucial for +diagnostic and therapeutic practices in healthcare. Recent years have seen a +substantial uptake in applying deep learning and computer vision to analyse and +interpret medical images. Traditional approaches, such as convolutional neural +networks (CNNs) and vision transformers (ViTs), face significant computational +challenges, prompting the need for architectural advancements. Recent efforts +have led to the introduction of novel architectures like the ``Mamba'' model as +alternative solutions to traditional CNNs or ViTs. The Mamba model excels in +the linear processing of one-dimensional data with low computational demands. +However, Mamba's potential for 3D medical image analysis remains underexplored +and could face significant computational challenges as the dimension increases. +This manuscript presents MobileViM, a streamlined architecture for efficient +segmentation of 3D medical images. In the MobileViM network, we invent a new +dimension-independent mechanism and a dual-direction traversing approach to +incorporate with a vision-Mamba-based framework. MobileViM also features a +cross-scale bridging technique to improve efficiency and accuracy across +various medical imaging modalities. With these enhancements, MobileViM achieves +segmentation speeds exceeding 90 frames per second (FPS) on a single graphics +processing unit (i.e., NVIDIA RTX 4090). This performance is over 24 FPS faster +than the state-of-the-art deep learning models for processing 3D images with +the same computational resources. In addition, experimental evaluations +demonstrate that MobileViM delivers superior performance, with Dice similarity +scores reaching 92.72%, 86.69%, 80.46%, and 77.43% for PENGWIN, BraTS2024, +ATLAS, and Toothfairy2 datasets, respectively, which significantly surpasses +existing models. + +
+
+ comment: The co-authors have not approved its submission to arXiv +
+
+
+
+
+ + ♻ ☆ Balancing Accuracy and Efficiency for Large-Scale SLAM: A Minimal Subset + Approach for Scalable Loop Closures + + +
+ Typical LiDAR SLAM architectures feature a front-end for odometry estimation +and a back-end for refining and optimizing the trajectory and map, commonly +through loop closures. However, loop closure detection in large-scale missions +presents significant computational challenges due to the need to identify, +verify, and process numerous candidate pairs for pose graph optimization. +Keyframe sampling bridges the front-end and back-end by selecting frames for +storing and processing during global optimization. This article proposes an +online keyframe sampling approach that constructs the pose graph using the most +impactful keyframes for loop closure. We introduce the Minimal Subset Approach +(MSA), which optimizes two key objectives: redundancy minimization and +information preservation, implemented within a sliding window framework. By +operating in the feature space rather than 3-D space, MSA efficiently reduces +redundant keyframes while retaining essential information. In sum, evaluations +on diverse public datasets show that the proposed approach outperforms naive +methods in reducing false positive rates in place recognition, while delivering +superior ATE and RPE in metric localization, without the need for manual +parameter tuning. Additionally, MSA demonstrates efficiency and scalability by +reducing memory usage and computational overhead during loop closure detection +and pose graph optimization. + +
+
+ comment: 8 pages, 7 Figures, 2 Tables. Submitted +
+
+
+
+
+ + ♻ ☆ Compositional Entailment Learning for Hyperbolic Vision-Language Models ICLR 2025 + + +
+ Image-text representation learning forms a cornerstone in vision-language +models, where pairs of images and textual descriptions are contrastively +aligned in a shared embedding space. Since visual and textual concepts are +naturally hierarchical, recent work has shown that hyperbolic space can serve +as a high-potential manifold to learn vision-language representation with +strong downstream performance. In this work, for the first time we show how to +fully leverage the innate hierarchical nature of hyperbolic embeddings by +looking beyond individual image-text pairs. We propose Compositional Entailment +Learning for hyperbolic vision-language models. The idea is that an image is +not only described by a sentence but is itself a composition of multiple object +boxes, each with their own textual description. Such information can be +obtained freely by extracting nouns from sentences and using openly available +localized grounding models. We show how to hierarchically organize images, +image boxes, and their textual descriptions through contrastive and +entailment-based objectives. Empirical evaluation on a hyperbolic +vision-language model trained with millions of image-text pairs shows that the +proposed compositional learning approach outperforms conventional Euclidean +CLIP learning, as well as recent hyperbolic alternatives, with better zero-shot +and retrieval generalization and clearly stronger hierarchical performance. + +
+
+ comment: Accepted as oral paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Going Beyond Feature Similarity: Effective Dataset distillation based on + Class-aware Conditional Mutual Information ICLR 2025 + + +
+ Dataset distillation (DD) aims to minimize the time and memory consumption +needed for training deep neural networks on large datasets, by creating a +smaller synthetic dataset that has similar performance to that of the full real +dataset. However, current dataset distillation methods often result in +synthetic datasets that are excessively difficult for networks to learn from, +due to the compression of a substantial amount of information from the original +data through metrics measuring feature similarity, e.g., distribution matching +(DM). In this work, we introduce conditional mutual information (CMI) to assess +the class-aware complexity of a dataset and propose a novel method by +minimizing CMI. Specifically, we minimize the distillation loss while +constraining the class-aware complexity of the synthetic dataset by minimizing +its empirical CMI from the feature space of pre-trained networks, +simultaneously. Through a thorough set of experiments, we show that our +method can serve as a general regularization method to existing DD methods and +improve the performance and training efficiency. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Perceptual Visual Quality Assessment: Principles, Methods, and Future + Directions + + +
+ As multimedia services such as video streaming, video conferencing, virtual +reality (VR), and online gaming continue to expand, ensuring high perceptual +visual quality becomes a priority to maintain user satisfaction and +competitiveness. However, multimedia content undergoes various distortions +during acquisition, compression, transmission, and storage, resulting in the +degradation of experienced quality. Thus, perceptual visual quality assessment +(PVQA), which focuses on evaluating the quality of multimedia content based on +human perception, is essential for optimizing user experiences in advanced +communication systems. Several challenges are involved in the PVQA process, +including diverse characteristics of multimedia content such as image, video, +VR, point cloud, mesh, multimodality, etc., and complex distortion scenarios as +well as viewing conditions. In this paper, we first present an overview of PVQA +principles and methods. This includes both subjective methods, where users +directly rate their experiences, and objective methods, where algorithms +predict human perception based on measurable factors such as bitrate, frame +rate, and compression levels. Based on the basics of PVQA, quality predictors +for different multimedia data are then introduced. In addition to traditional +images and videos, immersive multimedia and generative artificial intelligence +(GenAI) content are also discussed. Finally, the paper concludes with a +discussion on the future directions of PVQA research. + +
+
+ comment: A tutorial and review +
+
+
+
+
+ + ☆ Unbiased Video Scene Graph Generation via Visual and Semantic Dual + Debiasing CVPR 2025 + + +
+ Video Scene Graph Generation (VidSGG) aims to capture dynamic relationships +among entities by sequentially analyzing video frames and integrating visual +and semantic information. However, VidSGG is challenged by significant biases +that skew predictions. To mitigate these biases, we propose a VIsual and +Semantic Awareness (VISA) framework for unbiased VidSGG. VISA addresses visual +bias through memory-enhanced temporal integration that enhances object +representations and concurrently reduces semantic bias by iteratively +integrating object features with comprehensive semantic information derived +from triplet relationships. This visual-semantics dual debiasing approach +results in more unbiased representations of complex scene dynamics. Extensive +experiments demonstrate the effectiveness of our method, where VISA outperforms +existing unbiased VidSGG approaches by a substantial margin (e.g., +13.1% +improvement in mR@20 and mR@50 for the SGCLS task under Semi Constraint). + +
+
+ comment: 17 pages, 8 figures, CVPR 2025 +
+
+
+
+
+ + ☆ PodAgent: A Comprehensive Framework for Podcast Generation + + +
+ Existing automatic audio generation methods struggle to generate +podcast-like audio programs effectively. The key challenges lie in in-depth +content generation, appropriate and expressive voice production. This paper +proposes PodAgent, a comprehensive framework for creating audio programs. +PodAgent 1) generates informative topic-discussion content by designing a +Host-Guest-Writer multi-agent collaboration system, 2) builds a voice pool for +suitable voice-role matching and 3) utilizes LLM-enhanced speech synthesis +method to generate expressive conversational speech. Given the absence of +standardized evaluation criteria for podcast-like audio generation, we +developed comprehensive assessment guidelines to effectively evaluate the +model's performance. Experimental results demonstrate PodAgent's effectiveness, +significantly surpassing direct GPT-4 generation in topic-discussion dialogue +content, achieving an 87.4% voice-matching accuracy, and producing more +expressive speech through LLM-guided synthesis. Demo page: +https://podcast-agent.github.io/demo/. Source code: +https://github.com/yujxx/PodAgent. + +
+
+
+
+
+ + MIRROR: Multi-Modal Pathological Self-Supervised Representation Learning + via Modality Alignment and Retention + + +
+ Histopathology and transcriptomics are fundamental modalities in oncology, +encapsulating the morphological and molecular aspects of the disease. +Multi-modal self-supervised learning has demonstrated remarkable potential in +learning pathological representations by integrating diverse data sources. +Conventional multi-modal integration methods primarily emphasize modality +alignment, while paying insufficient attention to retaining the +modality-specific structures. However, unlike conventional scenarios where +multi-modal inputs share highly overlapping features, histopathology and +transcriptomics exhibit pronounced heterogeneity, offering orthogonal yet +complementary insights. Histopathology provides morphological and spatial +context, elucidating tissue architecture and cellular topology, whereas +transcriptomics delineates molecular signatures through gene expression +patterns. This inherent disparity introduces a major challenge in aligning them +while maintaining modality-specific fidelity. To address these challenges, we +present MIRROR, a novel multi-modal representation learning method designed to +foster both modality alignment and retention. MIRROR employs dedicated encoders +to extract comprehensive features for each modality, which is further +complemented by a modality alignment module to achieve seamless integration +between phenotype patterns and molecular profiles. Furthermore, a modality +retention module safeguards unique attributes from each modality, while a style +clustering module mitigates redundancy and enhances disease-relevant +information by modeling and aligning consistent pathological signatures within +a clustering space. Extensive evaluations on TCGA cohorts for cancer subtyping +and survival analysis highlight MIRROR's superior performance, demonstrating +its effectiveness in constructing comprehensive oncological feature +representations and benefiting the cancer diagnosis. + +
+
+ comment: 10 pages, 5 figures, 3 tables +
+
+
+
+
+
+
+ + + +
+
+ +
+
/*
 * Page behaviour script (index.js):
 *   1. TAB toggles every <details> element open/closed.
 *   2. A checkbox inside `.theme-switch` switches between the light and dark
 *      themes and persists the choice in localStorage.
 *   3. The build timestamp is read and converted to the visitor's locale.
 */

/* Expand/Collapse with TAB key */
let expanded = false;

document.addEventListener("keydown", (e) => {
    // `KeyboardEvent.keyCode` is deprecated; `key` is the supported equivalent
    // of the original `keyCode === 9` check.
    if (e.key !== "Tab") {
        return;
    }
    expanded = !expanded;
    document.querySelectorAll("details").forEach((detail) => {
        detail.open = expanded;
    });
    // Equivalent to `return false` in an `onkeydown` property handler:
    // cancels the default action of the key press.
    e.preventDefault();
});

/* Switch Theme */
const THEME_STORAGE_KEY = "theme";
const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');

/**
 * Apply a theme to the page: sets `data-theme` on the root element and, for
 * the two known themes, the matching icon class on `#theme-icon`.
 *
 * @param {string} theme - Theme name; "light" and "dark" also update the icon.
 */
function applyTheme(theme) {
    document.documentElement.setAttribute("data-theme", theme);
    const icon = document.getElementById("theme-icon");
    if (!icon) {
        return;
    }
    if (theme === "light") {
        icon.className = "ri-sun-line";
    } else if (theme === "dark") {
        icon.className = "ri-moon-line";
    }
}

/**
 * Read the persisted theme.
 *
 * @returns {string|null} The stored theme, or null when nothing usable is
 *     stored or storage cannot be accessed.
 */
function readStoredTheme() {
    try {
        return localStorage.getItem(THEME_STORAGE_KEY) || null;
    } catch (err) {
        // Storage access can throw (e.g. blocked by browser settings);
        // persistence is best-effort, so fall back to the default theme.
        console.warn("Unable to read stored theme:", err);
        return null;
    }
}

/**
 * Persist the chosen theme (best-effort).
 *
 * @param {string} theme - Theme name to store.
 */
function storeTheme(theme) {
    try {
        localStorage.setItem(THEME_STORAGE_KEY, theme);
    } catch (err) {
        console.warn("Unable to persist theme:", err);
    }
}

/**
 * `change` handler for the theme checkbox: checked means light, unchecked
 * means dark. Applies the theme and stores it.
 *
 * @param {Event} e - Change event whose target is the theme checkbox.
 */
function switchTheme(e) {
    const theme = e.target.checked ? "light" : "dark";
    applyTheme(theme);
    storeTheme(theme);
}

if (toggleSwitch) {
    toggleSwitch.addEventListener("change", switchTheme, false);
}

// Restore the persisted theme. Going through applyTheme keeps the icon in
// sync with the restored theme, not just the attribute and the checkbox.
const currentTheme = readStoredTheme();
if (currentTheme) {
    applyTheme(currentTheme);
    if (toggleSwitch && currentTheme === "light") {
        toggleSwitch.checked = true;
    }
}

/* Build timestamp */
const timestamp = document.getElementById("build-timestamp");
// Locale-formatted build time; null when the page has no #build-timestamp.
const timestamp_local = timestamp
    ? new Date(timestamp.getAttribute("datetime")).toLocaleString()
    : null;

// NOTE: the badge update below is disabled in the original script, so
// `timestamp_local` and `badge` are currently unused.
const badge = document.getElementById("build-timestamp-badge");
// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`